diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 83b463c630d71..918b0837098ee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -644,6 +645,36 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); + // If the index is dynamic, sandwich the load with bitcasts, i.e. for + //   VectorTy      SubVecTy      AccessTy + //   <64 x i8>     <16 x i8>     <8 x i16> + // emit <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>. + // Extracting a subvector with a dynamic index has a very large expansion + // in the AMDGPU backend, so limit this to power-of-2 subvector sizes. + FixedVectorType *VectorTy = AA.Vector.Ty; + TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u; + uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value(); + bool IsAlignedLoad = NumBits <= (LoadAlign * 8u); + unsigned TotalNumElts = VectorTy->getNumElements(); + bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0; + if (!isa<ConstantInt>(Index) && + llvm::isPowerOf2_32(SubVecTy->getNumElements()) && + IsProperlyDivisible && IsAlignedLoad) { + IntegerType *NewElemTy = Builder.getIntNTy(NumBits); + const unsigned NewNumElts = + DL.getTypeStoreSize(VectorTy) * 8u / NumBits; + const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements()); + FixedVectorType *BitCastTy = + FixedVectorType::get(NewElemTy, NewNumElts); + Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy); + Value *NewIdx = Builder.CreateLShr( + Index, ConstantInt::get(Index->getType(), LShrAmt)); + Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx); + Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy); + Inst->replaceAllUsesWith(BCOut); + return nullptr; + } + Value *SubVec = PoisonValue::get(SubVecTy); for (unsigned K = 0; K < NumLoadedElts; ++K) { Value *CurIdx = diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts-llc.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts-llc.ll new file mode 100644 index 0000000000000..084b7a2d59b2f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts-llc.ll @@ -0,0 +1,497 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GFX12 + +define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out, i32 %idx) { +; GFX9-LABEL: test_bitcast_llc_v128i8_v16i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s33, s[4:5], 0x8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: s_and_b32 s1, s0, 0xff +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s33, s33, s33 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_lshl_b32 s33, s33, 1 +; GFX9-NEXT: s_mov_b32 s2,
s0 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_mov_b32 s6, s0 +; GFX9-NEXT: s_mov_b32 s7, s0 +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s0 +; GFX9-NEXT: s_mov_b32 s10, s0 +; GFX9-NEXT: s_mov_b32 s11, s0 +; GFX9-NEXT: s_mov_b32 s12, s0 +; GFX9-NEXT: s_mov_b32 s13, s0 +; GFX9-NEXT: s_mov_b32 s14, s0 +; GFX9-NEXT: s_mov_b32 s15, s0 +; GFX9-NEXT: s_mov_b32 s16, s0 +; GFX9-NEXT: s_mov_b32 s17, s0 +; GFX9-NEXT: s_mov_b32 s18, s0 +; GFX9-NEXT: s_mov_b32 s19, s0 +; GFX9-NEXT: s_mov_b32 s20, s0 +; GFX9-NEXT: s_mov_b32 s21, s0 +; GFX9-NEXT: s_mov_b32 s22, s0 +; GFX9-NEXT: s_mov_b32 s23, s0 +; GFX9-NEXT: s_mov_b32 s24, s0 +; GFX9-NEXT: s_mov_b32 s25, s0 +; GFX9-NEXT: s_mov_b32 s26, s0 +; GFX9-NEXT: s_mov_b32 s27, s0 +; GFX9-NEXT: s_mov_b32 s28, s0 +; GFX9-NEXT: s_mov_b32 s29, s0 +; GFX9-NEXT: s_mov_b32 s30, s0 +; GFX9-NEXT: s_mov_b32 s31, s0 +; GFX9-NEXT: s_add_i32 s36, s33, 3 +; GFX9-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX9-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX9-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX9-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX9-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX9-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX9-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX9-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GFX9-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GFX9-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GFX9-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GFX9-NEXT: v_mov_b64_e32 v[24:25], s[24:25] +; GFX9-NEXT: v_mov_b64_e32 v[26:27], s[26:27] +; GFX9-NEXT: v_mov_b64_e32 v[28:29], s[28:29] +; GFX9-NEXT: v_mov_b64_e32 v[30:31], s[30:31] +; GFX9-NEXT: s_set_gpr_idx_on s36, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_add_i32 s0, s33, 2 +; GFX9-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: v_mov_b32_e32 v36, 0 +; GFX9-NEXT: s_set_gpr_idx_on s33, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v33, v1 +; GFX9-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: global_store_dwordx4 v36, v[32:35], s[34:35] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_bitcast_llc_v128i8_v16i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s33, s[4:5], 0x8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: v_mov_b32_e32 v35, 0 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_mov_b32 s10, s0 +; GFX11-NEXT: s_mov_b32 s11, s0 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s13, s0 +; GFX11-NEXT: s_mov_b32 s14, s0 +; GFX11-NEXT: s_mov_b32 s15, s0 +; GFX11-NEXT: s_mov_b32 s16, s0 +; GFX11-NEXT: s_mov_b32 s17, s0 +; GFX11-NEXT: s_mov_b32 s18, s0 +; GFX11-NEXT: s_mov_b32 s19, s0 +; GFX11-NEXT: s_mov_b32 s20, s0 +; GFX11-NEXT: s_mov_b32 s21, s0 +; 
GFX11-NEXT: s_mov_b32 s22, s0 +; GFX11-NEXT: s_mov_b32 s23, s0 +; GFX11-NEXT: s_mov_b32 s24, s0 +; GFX11-NEXT: s_mov_b32 s25, s0 +; GFX11-NEXT: s_mov_b32 s26, s0 +; GFX11-NEXT: s_mov_b32 s27, s0 +; GFX11-NEXT: s_mov_b32 s28, s0 +; GFX11-NEXT: s_mov_b32 s29, s0 +; GFX11-NEXT: s_mov_b32 s30, s0 +; GFX11-NEXT: s_mov_b32 s31, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s33, s33, s33 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-NEXT: s_lshl_b32 s0, s33, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 m0, s0, 3 +; GFX11-NEXT: v_movrels_b32_e32 v34, v0 +; GFX11-NEXT: s_add_i32 m0, s0, 2 +; GFX11-NEXT: v_movrels_b32_e32 v33, v0 +; GFX11-NEXT: s_mov_b32 m0, s0 +; GFX11-NEXT: v_movrels_b32_e32 v32, v1 +; GFX11-NEXT: v_movrels_b32_e32 v31, v0 +; GFX11-NEXT: global_store_b128 v35, v[31:34], s[34:35] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_bitcast_llc_v128i8_v16i8: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[36:38], s[4:5], 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 8 +; GFX12-NEXT: v_mov_b32_e32 v35, 0 +; GFX12-NEXT: s_and_b32 s1, s0, 0xff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s0, s1, s0 +; GFX12-NEXT: s_and_b32 s1, s0, 0xffff +; GFX12-NEXT: s_lshl_b32 s0, s0, 16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s0, s1, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s0 +; GFX12-NEXT: s_mov_b32 s10, s0 +; GFX12-NEXT: s_mov_b32 s11, s0 +; GFX12-NEXT: s_mov_b32 s12, s0 +; GFX12-NEXT: s_mov_b32 s13, s0 +; GFX12-NEXT: s_mov_b32 s14, s0 +; GFX12-NEXT: s_mov_b32 s15, s0 +; GFX12-NEXT: s_mov_b32 s16, s0 +; GFX12-NEXT: s_mov_b32 s17, s0 +; GFX12-NEXT: s_mov_b32 s18, s0 +; GFX12-NEXT: s_mov_b32 s19, s0 +; GFX12-NEXT: s_mov_b32 s20, s0 +; GFX12-NEXT: s_mov_b32 s21, s0 +; GFX12-NEXT: s_mov_b32 s22, s0 +; GFX12-NEXT: s_mov_b32 s23, s0 +; GFX12-NEXT: s_mov_b32 s24, s0 +; GFX12-NEXT: s_mov_b32 s25, s0 +; GFX12-NEXT: s_mov_b32 s26, s0 +; GFX12-NEXT: s_mov_b32 s27, s0 +; GFX12-NEXT: s_mov_b32 s28, s0 +; GFX12-NEXT: s_mov_b32 s29, s0 +; GFX12-NEXT: s_mov_b32 s30, s0 +; GFX12-NEXT: s_mov_b32 s31, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s33, s38, s38 +; GFX12-NEXT: v_dual_mov_b32 
v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX12-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX12-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX12-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX12-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX12-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX12-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX12-NEXT: s_lshl_b32 s0, s33, 1 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: s_add_co_i32 m0, s0, 3 +; GFX12-NEXT: v_movrels_b32_e32 v34, v0 +; GFX12-NEXT: s_add_co_i32 m0, s0, 2 +; GFX12-NEXT: v_movrels_b32_e32 v33, v0 +; GFX12-NEXT: s_mov_b32 m0, s0 +; GFX12-NEXT: v_movrels_b32_e32 v32, v1 +; GFX12-NEXT: v_movrels_b32_e32 v31, v0 +; GFX12-NEXT: global_store_b128 v35, v[31:34], s[36:37] +; GFX12-NEXT: s_endpgm +entry: + %alloca = freeze <128 x i8> poison + %allocabc = bitcast <128 x i8> %alloca to <8 x i128> + %vec = extractelement <8 x i128> %allocabc, i32 %idx + %vecbc = bitcast i128 %vec to <16 x i8> + store <16 x i8> %vecbc, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_llc_v64i16_v8i16(ptr addrspace(1) %out, i32 %idx) { +; GFX9-LABEL: test_bitcast_llc_v64i16_v8i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s2, s2 +; GFX9-NEXT: s_lshl_b32 s2, s2, 1 +; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v1, v1 +; GFX9-NEXT: s_add_i32 s3, s2, 3 +; GFX9-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_add_i32 s2, s2, 2 +; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_bitcast_llc_v64i16_v8i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: v_movrels_b32_e32 v1, v1 +; GFX11-NEXT: v_movrels_b32_e32 v0, v0 +; GFX11-NEXT: s_add_i32 m0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_movrels_b32_e32 v3, v0 +; GFX11-NEXT: s_add_i32 m0, s2, 2 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_movrels_b32_e32 v2, v0 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_bitcast_llc_v64i16_v8i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 
s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s2, s2, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s2, s2, 1 +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: v_movrels_b32_e32 v1, v1 +; GFX12-NEXT: v_movrels_b32_e32 v0, v0 +; GFX12-NEXT: s_add_co_i32 m0, s2, 3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_movrels_b32_e32 v3, v0 +; GFX12-NEXT: s_add_co_i32 m0, s2, 2 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: v_movrels_b32_e32 v2, v0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm +entry: + %alloca = freeze <64 x i16> poison + %allocabc = bitcast <64 x i16> %alloca to <8 x i128> + %vec = extractelement <8 x i128> %allocabc, i32 %idx + %vecbc = bitcast i128 %vec to <8 x i16> + store <8 x i16> %vecbc, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_llc_v32i32_v4i32(ptr addrspace(1) %out, i32 %idx) { +; GFX9-LABEL: test_bitcast_llc_v32i32_v4i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s2, s2 +; GFX9-NEXT: s_lshl_b32 s2, s2, 1 +; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v1, v1 +; GFX9-NEXT: s_add_i32 s3, s2, 3 +; GFX9-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_add_i32 s2, s2, 2 +; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_bitcast_llc_v32i32_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: v_movrels_b32_e32 v1, v1 +; GFX11-NEXT: v_movrels_b32_e32 v0, v0 +; GFX11-NEXT: s_add_i32 m0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_movrels_b32_e32 v3, v0 +; GFX11-NEXT: s_add_i32 m0, s2, 2 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_movrels_b32_e32 v2, v0 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_bitcast_llc_v32i32_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s2, s2, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s2, s2, 1 +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: v_movrels_b32_e32 v1, v1 +; GFX12-NEXT: v_movrels_b32_e32 v0, v0 +; GFX12-NEXT: s_add_co_i32 m0, s2, 3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_movrels_b32_e32 v3, v0 +; GFX12-NEXT: s_add_co_i32 m0, s2, 2 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: v_movrels_b32_e32 v2, v0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm +entry: + %alloca = freeze <32 x i32> poison + %allocabc = bitcast <32 x i32> %alloca to <8 x i128> + %vec = extractelement <8 x i128> %allocabc, i32 %idx + 
%vecbc = bitcast i128 %vec to <4 x i32> + store <4 x i32> %vecbc, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_llc_v16i64_v4i256(ptr addrspace(1) %out, i32 %idx) { +; GFX9-LABEL: test_bitcast_llc_v16i64_v4i256: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s2, s2, s2 +; GFX9-NEXT: s_add_i32 s3, s2, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s3 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v1, v1 +; GFX9-NEXT: s_add_i32 s4, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_add_i32 s5, s3, 2 +; GFX9-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_add_i32 s2, s2, s2 +; GFX9-NEXT: s_set_gpr_idx_on s5, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_lshl_b32 s2, s2, 1 +; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_add_i32 s3, s2, 3 +; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: s_add_i32 s2, s2, 2 +; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_bitcast_llc_v16i64_v4i256: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s2, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s2, 1 +; GFX11-NEXT: s_add_i32 s2, s2, s2 +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-NEXT: s_lshl_b32 s3, s0, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_mov_b32 m0, s3 +; GFX11-NEXT: v_movrels_b32_e32 v1, v1 +; GFX11-NEXT: v_movrels_b32_e32 v0, v0 +; GFX11-NEXT: s_add_i32 m0, s3, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_movrels_b32_e32 v3, v0 +; GFX11-NEXT: s_add_i32 m0, s3, 2 +; GFX11-NEXT: v_movrels_b32_e32 v2, v0 +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: v_movrels_b32_e32 v5, v1 +; GFX11-NEXT: v_movrels_b32_e32 v4, v0 +; GFX11-NEXT: s_add_i32 m0, s2, 3 +; GFX11-NEXT: v_movrels_b32_e32 v7, v0 +; GFX11-NEXT: s_add_i32 m0, s2, 2 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_movrels_b32_e32 v6, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_bitcast_llc_v16i64_v4i256: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s2, s2, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_co_i32 s3, s2, 1 +; GFX12-NEXT: s_add_co_i32 s2, s2, s2 +; GFX12-NEXT: s_add_co_i32 s3, s3, s3 +; GFX12-NEXT: s_lshl_b32 s2, s2, 1 +; GFX12-NEXT: s_lshl_b32 s3, s3, 1 +; GFX12-NEXT: s_mov_b32 m0, s3 +; GFX12-NEXT: v_movrels_b32_e32 v1, v1 +; GFX12-NEXT: v_movrels_b32_e32 v0, 
v0 +; GFX12-NEXT: s_add_co_i32 m0, s3, 3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_movrels_b32_e32 v3, v0 +; GFX12-NEXT: s_add_co_i32 m0, s3, 2 +; GFX12-NEXT: v_movrels_b32_e32 v2, v0 +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: v_movrels_b32_e32 v5, v1 +; GFX12-NEXT: v_movrels_b32_e32 v4, v0 +; GFX12-NEXT: s_add_co_i32 m0, s2, 3 +; GFX12-NEXT: v_movrels_b32_e32 v7, v0 +; GFX12-NEXT: s_add_co_i32 m0, s2, 2 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: v_movrels_b32_e32 v6, v0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX12-NEXT: s_endpgm +entry: + %alloca = freeze <16 x i64> poison + %allocabc = bitcast <16 x i64> %alloca to <4 x i256> + %vec = extractelement <4 x i256> %allocabc, i32 %idx + %vecbc = bitcast i256 %vec to <4 x i64> + store <4 x i64> %vecbc, ptr addrspace(1) %out, align 16 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll new file mode 100644 index 0000000000000..d8bb5d70cdc5e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx-bitcasts.ll @@ -0,0 +1,315 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s + +define amdgpu_kernel void @test_bitcast_gen_64i8_v16i8(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_64i8_v16i8( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[ALLOCA]] to <4 x i128> +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[IDX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i128> [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i128 [[TMP2]] to <16 x i8> +; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [4 x [16 x i8]], align 16, addrspace(5) + %gep = getelementptr <16 x i8>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <16 x i8>, ptr addrspace(5) %gep, align 16 + store <16 x i8> %load, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_32i16_v8i16(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_32i16_v8i16( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <32 x i16> poison +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[ALLOCA]] to <4 x i128> +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[IDX]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i128> [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i128 [[TMP2]] to <8 x i16> +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [32 x i16], align 16, addrspace(5) + %gep = getelementptr <8 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <8 x i16>, ptr addrspace(5) %gep, align 16 + store <8 x i16> %load, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_64i8_v32i8(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_64i8_v32i8( +; 
CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[ALLOCA]] to <2 x i256> +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[IDX]], 5 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i256> [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i256 [[TMP2]] to <32 x i8> +; CHECK-NEXT: store <32 x i8> [[TMP3]], ptr addrspace(1) [[OUT]], align 32 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [2 x [32 x i8]], align 16, addrspace(5) + %gep = getelementptr <32 x i8>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <32 x i8>, ptr addrspace(5) %gep, align 32 + store <32 x i8> %load, ptr addrspace(1) %out, align 32 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_64i8_v32i8_align16(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_64i8_v32i8_align16( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <32 x i8> poison, i8 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <32 x i8> [[TMP1]], i8 [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[IDX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <32 x i8> [[TMP4]], i8 [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[IDX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <32 x i8> [[TMP7]], i8 [[TMP9]], i64 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[IDX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i8> [[TMP10]], i8 [[TMP12]], i64 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[IDX]], 5 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <32 x i8> [[TMP13]], i8 [[TMP15]], i64 5 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[IDX]], 6 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <32 x i8> [[TMP16]], i8 [[TMP18]], i64 6 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[IDX]], 7 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <32 x i8> [[TMP19]], i8 [[TMP21]], i64 7 +; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[IDX]], 8 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <32 x i8> [[TMP22]], i8 [[TMP24]], i64 8 +; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[IDX]], 9 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <32 x i8> [[TMP25]], i8 [[TMP27]], i64 9 +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[IDX]], 10 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <32 x i8> [[TMP28]], i8 [[TMP30]], i64 10 +; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[IDX]], 11 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i8> 
[[ALLOCA]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <32 x i8> [[TMP31]], i8 [[TMP33]], i64 11 +; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[IDX]], 12 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <32 x i8> [[TMP34]], i8 [[TMP36]], i64 12 +; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[IDX]], 13 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <32 x i8> [[TMP37]], i8 [[TMP39]], i64 13 +; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[IDX]], 14 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <32 x i8> [[TMP40]], i8 [[TMP42]], i64 14 +; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[IDX]], 15 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <32 x i8> [[TMP43]], i8 [[TMP45]], i64 15 +; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[IDX]], 16 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <32 x i8> [[TMP46]], i8 [[TMP48]], i64 16 +; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[IDX]], 17 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <32 x i8> [[TMP49]], i8 [[TMP51]], i64 17 +; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[IDX]], 18 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <32 x i8> [[TMP52]], i8 [[TMP54]], i64 18 +; CHECK-NEXT: [[TMP56:%.*]] = add i32 [[IDX]], 19 +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <32 x i8> [[TMP55]], i8 [[TMP57]], i64 19 +; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[IDX]], 20 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <32 x i8> [[TMP58]], i8 [[TMP60]], i64 20 +; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[IDX]], 21 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <32 x i8> [[TMP61]], i8 [[TMP63]], i64 21 +; CHECK-NEXT: [[TMP65:%.*]] = add i32 [[IDX]], 22 +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <32 x i8> [[TMP64]], i8 [[TMP66]], i64 22 +; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[IDX]], 23 +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <32 x i8> [[TMP67]], i8 [[TMP69]], i64 23 +; CHECK-NEXT: [[TMP71:%.*]] = add i32 [[IDX]], 24 +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <32 x i8> [[TMP70]], i8 [[TMP72]], i64 24 +; CHECK-NEXT: [[TMP74:%.*]] = add i32 [[IDX]], 25 +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP74]] +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <32 x i8> [[TMP73]], i8 [[TMP75]], i64 25 +; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[IDX]], 26 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <32 x i8> [[TMP76]], i8 [[TMP78]], i64 26 +; CHECK-NEXT: [[TMP80:%.*]] = add i32 [[IDX]], 27 +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP80]] +; 
CHECK-NEXT: [[TMP82:%.*]] = insertelement <32 x i8> [[TMP79]], i8 [[TMP81]], i64 27 +; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[IDX]], 28 +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP83]] +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <32 x i8> [[TMP82]], i8 [[TMP84]], i64 28 +; CHECK-NEXT: [[TMP86:%.*]] = add i32 [[IDX]], 29 +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP86]] +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <32 x i8> [[TMP85]], i8 [[TMP87]], i64 29 +; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[IDX]], 30 +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP89]] +; CHECK-NEXT: [[TMP91:%.*]] = insertelement <32 x i8> [[TMP88]], i8 [[TMP90]], i64 30 +; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[IDX]], 31 +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP92]] +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <32 x i8> [[TMP91]], i8 [[TMP93]], i64 31 +; CHECK-NEXT: store <32 x i8> [[TMP94]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [2 x [32 x i8]], align 16, addrspace(5) + %gep = getelementptr <32 x i8>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <32 x i8>, ptr addrspace(5) %gep, align 16 + store <32 x i8> %load, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_16i32_v4i32(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_16i32_v4i32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <16 x i32> poison +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i32> [[ALLOCA]] to <4 x i128> +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i128> [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i128 [[TMP2]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [16 x i32], align 16, addrspace(5) + %gep = getelementptr <4 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <4 x i32>, ptr addrspace(5) %gep, align 16 + store <4 x i32> %load, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_unaligned_gep(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_unaligned_gep( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <32 x i16> poison +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP22]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[IDX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[IDX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[TMP9]], i64 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[IDX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 
[[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP10]], i16 [[TMP12]], i64 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[IDX]], 5 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP15]], i64 5 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[IDX]], 6 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[TMP18]], i64 6 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[IDX]], 7 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i16> [[ALLOCA]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP21]], i64 7 +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr addrspace(1) [[OUT]], align 1 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [32 x i16], align 1, addrspace(5) + %gep = getelementptr i16, ptr addrspace(5) %alloca, i32 %idx + %load = load <8 x i16>, ptr addrspace(5) %gep, align 1 + store <8 x i16> %load, ptr addrspace(1) %out, align 1 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_12i32_v4i32(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_12i32_v4i32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <12 x i32> poison +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <12 x i32> [[ALLOCA]] to <3 x i128> +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i128> [[TMP0]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i128 [[TMP2]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [12 x i32], align 16, addrspace(5) + %gep = getelementptr <4 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <4 x i32>, ptr addrspace(5) %gep, align 16 + store <4 x i32> %load, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_12i32_v3i32(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_12i32_v3i32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <12 x i32> poison +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <12 x i32> [[ALLOCA]], i32 [[IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> poison, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <12 x i32> [[ALLOCA]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[IDX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <12 x i32> [[ALLOCA]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i32> [[TMP4]], i32 [[TMP6]], i64 2 +; CHECK-NEXT: store <3 x i32> [[TMP7]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [12 x i32], align 16, addrspace(5) + %gep = getelementptr <3 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <3 x i32>, ptr addrspace(5) %gep, align 16 + store <3 x i32> %load, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_bitcast_gen_11i32_v8i32(ptr addrspace(1) %out, i32 %idx) #0 { +; CHECK-LABEL: define amdgpu_kernel void @test_bitcast_gen_11i32_v8i32( +; CHECK-SAME: ptr addrspace(1) 
[[OUT:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <11 x i32> poison +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[IDX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[IDX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[IDX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP9]], i64 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[IDX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP12]], i64 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[IDX]], 5 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP15]], i64 5 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[IDX]], 6 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP18]], i64 6 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[IDX]], 7 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <11 x i32> [[ALLOCA]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP21]], i64 7 +; CHECK-NEXT: store <8 x i32> [[TMP22]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [11 x i32], align 16, addrspace(5) + %gep = getelementptr <8 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %idx + %load = load <8 x i32>, ptr addrspace(5) %gep, align 16 + store <8 x i32> %load, ptr addrspace(1) %out, align 16 + ret void +}
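In essence, the new promoteAllocaUserToVector path replaces the per-element extract/insert expansion with a single widened extractelement whenever the subvector element count is a power of two, the promoted vector divides evenly into subvector slots, and the load is aligned to at least the subvector size. A minimal hand-written LLVM IR sketch of the rewrite, mirroring the first autogenerated opt test above (value names here are illustrative, not from the patch):

  ; Before promotion: a subvector load from the alloca at a dynamic i8 index.
  %gep  = getelementptr <16 x i8>, ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load <16 x i8>, ptr addrspace(5) %gep, align 16

  ; After promotion, with the alloca vectorized as %vec : <64 x i8>.
  %bc   = bitcast <64 x i8> %vec to <4 x i128>
  %slot = lshr i32 %idx, 4    ; LShrAmt = Log2_32(16): i8 index -> i128 slot
  %elt  = extractelement <4 x i128> %bc, i32 %slot
  %out  = bitcast i128 %elt to <16 x i8>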