From 8cfe73ab96e33f5b27cf4d7eccc5ba820037693d Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 5 Feb 2024 14:11:45 +0100 Subject: [PATCH 1/3] [AMDGPU][PromoteAlloca] Support memsets to ptr allocas Fixes #80366 --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 16 ++++++++++++---- .../test/CodeGen/AMDGPU/promote-alloca-memset.ll | 12 ++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 5e73411cae9b70..c1b244f50d93f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -521,10 +521,18 @@ static Value *promoteAllocaUserToVector( // For memset, we don't need to know the previous value because we // currently only allow memsets that cover the whole alloca. Value *Elt = MSI->getOperand(1); - if (DL.getTypeStoreSize(VecEltTy) > 1) { - Value *EltBytes = - Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt); - Elt = Builder.CreateBitCast(EltBytes, VecEltTy); + const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy); + if (BytesPerElt > 1) { + Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt); + + // If the element type of the vector is a pointer, we need to first cast + // to an integer, then use an IntToPtr cast. 
+ if (VecEltTy->isPointerTy()) { + Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8); + Elt = Builder.CreateBitCast(EltBytes, PtrInt); + Elt = Builder.CreateIntToPtr(Elt, VecEltTy); + } else + Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll index 15af1f17e230ec..829e7a1b84e90c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -84,4 +84,16 @@ entry: ret void } +define amdgpu_kernel void @memset_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_ptr_alloca( +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [6 x ptr], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) From 2e2375b1e4f662f8826c7fe7b280265e9d14d946 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 5 Feb 2024 14:25:23 +0100 Subject: [PATCH 2/3] add more tests --- .../CodeGen/AMDGPU/promote-alloca-memset.ll | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll index 829e7a1b84e90c..0c8d5841f76771 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -84,8 +84,8 @@ entry: ret void } -define amdgpu_kernel void @memset_ptr_alloca(ptr %out) { -; CHECK-LABEL: @memset_ptr_alloca( +define amdgpu_kernel void @memset_array_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_ptr_alloca( ; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 ; CHECK-NEXT: ret void ; @@ -96,4 +96,31 @@ define 
amdgpu_kernel void @memset_ptr_alloca(ptr %out) { ret void } +define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_vector_ptr_alloca( +; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca <6 x ptr>, align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + +define amdgpu_kernel void @memset_nested_array_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_nested_array_ptr_alloca( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5) +; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) From f1d9d1a785102b7e5873e69dd66bb6fdacb85b34 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Mon, 5 Feb 2024 14:31:37 +0100 Subject: [PATCH 3/3] more tests, the sequel --- .../CodeGen/AMDGPU/promote-alloca-memset.ll | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll index 0c8d5841f76771..f1e2737b370ef0 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -108,8 +108,8 @@ define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) { ret void } -define amdgpu_kernel void @memset_nested_array_ptr_alloca(ptr %out) { -; CHECK-LABEL: 
@memset_nested_array_ptr_alloca( +define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_of_array_ptr_alloca( ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5) ; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 @@ -123,4 +123,19 @@ define amdgpu_kernel void @memset_nested_array_ptr_alloca(ptr %out) { ret void } +define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) { +; CHECK-LABEL: @memset_array_of_vec_ptr_alloca( +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5) +; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false) +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false) + %load = load i64, ptr addrspace(5) %alloca + store i64 %load, ptr %out + ret void +} + declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)