diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f81da0537a414..40b295a961e4b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5787,6 +5787,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-treat-scalable-fixed-error-as-warning"); } + // Enable local accessor to shared memory pass for SYCL. + if (isa(JA) && IsSYCL) { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-sycl-enable-local-accessor"); + } // These two are potentially updated by AddClangCLArgs. codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo; bool EmitCodeView = false; diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 6bf5b1a220be1..9817382f22087 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -78,8 +78,12 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, const llvm::opt::ArgList &Args) const { // Construct lld command. // The output from ld.lld is an HSA code object file. - ArgStringList LldArgs{"-flavor", "gnu", "--no-undefined", "-shared", - "-plugin-opt=-amdgpu-internalize-symbols"}; + ArgStringList LldArgs{"-flavor", + "gnu", + "--no-undefined", + "-shared", + "-plugin-opt=-amdgpu-internalize-symbols", + "-plugin-opt=-sycl-enable-local-accessor"}; auto &TC = getToolChain(); auto &D = TC.getDriver(); diff --git a/clang/test/Driver/sycl-local-accessor-opt.cpp b/clang/test/Driver/sycl-local-accessor-opt.cpp new file mode 100644 index 0000000000000..855ef09e801e9 --- /dev/null +++ b/clang/test/Driver/sycl-local-accessor-opt.cpp @@ -0,0 +1,11 @@ +/// Check the correct handling of sycl-enable-local-accessor option. 
+ +// REQUIRES: clang-driver + +// RUN: %clang -fsycl -### %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-NO-OPT %s +// CHECK-NO-OPT-NOT: "-sycl-enable-local-accessor" + +// RUN: %clang -fsycl -fsycl-targets=nvptx64-nvidia-cuda -### %s 2>&1 \ +// RUN: | FileCheck %s +// CHECK: "-sycl-enable-local-accessor" diff --git a/llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.h b/llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h similarity index 88% rename from llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.h rename to llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h index d09d2c1e01ca5..845e80db8b43a 100644 --- a/llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.h +++ b/llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h @@ -8,9 +8,9 @@ // // This pass operates on SYCL kernels being compiled to CUDA. It modifies // kernel entry points which take pointers to shared memory and modifies them -// to take offsets into shared memory (represented by a symbol in the shared address -// space). The SYCL runtime is expected to provide offsets rather than pointers -// to these functions. +// to take offsets into shared memory (represented by a symbol in the shared +// address space). The SYCL runtime is expected to provide offsets rather than +// pointers to these functions. 
// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index d5b41e595acdd..e853bba272601 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -56,6 +56,8 @@ add_llvm_component_library(LLVMSYCLLowerIR ESIMDVerifier.cpp MutatePrintfAddrspace.cpp + LocalAccessorToSharedMemory.cpp + ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR ${LLVM_MAIN_SRC_DIR}/projects/vc-intrinsics/GenXIntrinsics/include diff --git a/llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.cpp b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp similarity index 61% rename from llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.cpp rename to llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp index f2409f569c963..57048c4a23734 100644 --- a/llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp @@ -14,92 +14,115 @@ // //===----------------------------------------------------------------------===// -#include "LocalAccessorToSharedMemory.h" -#include "../MCTargetDesc/NVPTXBaseInfo.h" +#include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" using namespace llvm; #define DEBUG_TYPE "localaccessortosharedmemory" +static bool EnableLocalAccessor; + +static cl::opt EnableLocalAccessorFlag( + "sycl-enable-local-accessor", cl::Hidden, + cl::desc("Enable local accessor to shared memory optimisation."), + cl::location(EnableLocalAccessor), cl::init(false)); + namespace llvm { void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); -} +} // namespace llvm namespace { class LocalAccessorToSharedMemory : public ModulePass { +private: + enum class ArchType { Cuda, AMDHSA, Unsupported }; + + 
struct KernelPayload { + KernelPayload(Function *Kernel, MDNode *MD = nullptr) + : Kernel(Kernel), MD(MD){}; + Function *Kernel; + MDNode *MD; + }; + + unsigned SharedASValue = 0; + public: static char ID; LocalAccessorToSharedMemory() : ModulePass(ID) {} bool runOnModule(Module &M) override { + if (!EnableLocalAccessor) + return false; + + auto AT = StringSwitch(M.getTargetTriple().c_str()) + .Case("nvptx64-nvidia-cuda", ArchType::Cuda) + .Case("nvptx-nvidia-cuda", ArchType::Cuda) + .Case("amdgcn-amd-amdhsa", ArchType::AMDHSA) + .Default(ArchType::Unsupported); + // Invariant: This pass is only intended to operate on SYCL kernels being - // compiled to the `nvptx{,64}-nvidia-cuda` triple. - // TODO: make sure that non-SYCL kernels are not impacted. + // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa` + // triples. + if (ArchType::Unsupported == AT) + return false; + if (skipModule(M)) return false; - // Keep track of whether the module was changed. - auto Changed = false; + switch (AT) { + case ArchType::Cuda: + // ADDRESS_SPACE_SHARED = 3, + SharedASValue = 3; + break; + case ArchType::AMDHSA: + // LOCAL_ADDRESS = 3, + SharedASValue = 3; + break; + default: + SharedASValue = 0; + break; + } - // Access `nvvm.annotations` to determine which functions are kernel entry - // points. - auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); - if (!NvvmMetadata) + SmallVector Kernels; + SmallVector> NewToOldKernels; + populateKernels(M, Kernels, AT); + if (Kernels.empty()) return false; - for (auto MetadataNode : NvvmMetadata->operands()) { - if (MetadataNode->getNumOperands() != 3) - continue; + // Process the function and if changed, update the metadata. 
+ for (auto K : Kernels) { + auto *NewKernel = processKernel(M, K.Kernel); + if (NewKernel) + NewToOldKernels.push_back(std::make_pair(NewKernel, K)); + } - // NVPTX identifies kernel entry points using metadata nodes of the form: - // !X = !{, !"kernel", i32 1} - const MDOperand &TypeOperand = MetadataNode->getOperand(1); - auto Type = dyn_cast(TypeOperand); - if (!Type) - continue; - // Only process kernel entry points. - if (Type->getString() != "kernel") - continue; + if (NewToOldKernels.empty()) + return false; - // Get a pointer to the entry point function from the metadata. - const MDOperand &FuncOperand = MetadataNode->getOperand(0); - if (!FuncOperand) - continue; - auto FuncConstant = dyn_cast(FuncOperand); - if (!FuncConstant) - continue; - auto Func = dyn_cast(FuncConstant->getValue()); - if (!Func) - continue; + postProcessKernels(NewToOldKernels, AT); - // Process the function and if changed, update the metadata. - auto NewFunc = this->ProcessFunction(M, Func); - if (NewFunc) { - Changed = true; - MetadataNode->replaceOperandWith( - 0, llvm::ConstantAsMetadata::get(NewFunc)); - } - } + return true; + } - return Changed; + virtual llvm::StringRef getPassName() const override { + return "SYCL Local Accessor to Shared Memory"; } - Function *ProcessFunction(Module &M, Function *F) { +private: + Function *processKernel(Module &M, Function *F) { // Check if this function is eligible by having an argument that uses shared // memory. auto UsesLocalMemory = false; for (Function::arg_iterator FA = F->arg_begin(), FE = F->arg_end(); FA != FE; ++FA) { - if (FA->getType()->isPointerTy()) { - UsesLocalMemory = - FA->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED; - } - if (UsesLocalMemory) { + if (FA->getType()->isPointerTy() && + FA->getType()->getPointerAddressSpace() == SharedASValue) { + UsesLocalMemory = true; break; } } @@ -111,9 +134,9 @@ class LocalAccessorToSharedMemory : public ModulePass { // Create a global symbol to CUDA shared memory. 
auto SharedMemGlobalName = F->getName().str(); SharedMemGlobalName.append("_shared_mem"); - auto SharedMemGlobalType = + auto *SharedMemGlobalType = ArrayType::get(Type::getInt8Ty(M.getContext()), 0); - auto SharedMemGlobal = new GlobalVariable( + auto *SharedMemGlobal = new GlobalVariable( /* Module= */ M, /* Type= */ &*SharedMemGlobalType, /* IsConstant= */ false, @@ -122,7 +145,7 @@ class LocalAccessorToSharedMemory : public ModulePass { /* Name= */ Twine{SharedMemGlobalName}, /* InsertBefore= */ nullptr, /* ThreadLocalMode= */ GlobalValue::NotThreadLocal, - /* AddressSpace= */ ADDRESS_SPACE_SHARED, + /* AddressSpace= */ SharedASValue, /* IsExternallyInitialized= */ false); SharedMemGlobal->setAlignment(Align(4)); @@ -139,7 +162,7 @@ class LocalAccessorToSharedMemory : public ModulePass { for (Function::arg_iterator FA = F->arg_begin(), FE = F->arg_end(); FA != FE; ++FA, ++i) { if (FA->getType()->isPointerTy() && - FA->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED) { + FA->getType()->getPointerAddressSpace() == SharedASValue) { // Replace pointers to shared memory with i32 offsets. Arguments.push_back(Type::getInt32Ty(M.getContext())); ArgumentAttributes.push_back( @@ -178,8 +201,8 @@ class LocalAccessorToSharedMemory : public ModulePass { if (ArgumentReplaced[i]) { // If this argument was replaced, then create a `getelementptr` // instruction that uses it to recreate the pointer that was replaced. - auto InsertBefore = &NF->getEntryBlock().front(); - auto PtrInst = GetElementPtrInst::CreateInBounds( + auto *InsertBefore = &NF->getEntryBlock().front(); + auto *PtrInst = GetElementPtrInst::CreateInBounds( /* PointeeType= */ SharedMemGlobalType, /* Ptr= */ SharedMemGlobal, /* IdxList= */ @@ -191,7 +214,7 @@ class LocalAccessorToSharedMemory : public ModulePass { // Then create a bitcast to make sure the new pointer is the same type // as the old one. This will only ever be a `i8 addrspace(3)*` to `i32 // addrspace(3)*` type of cast. 
- auto CastInst = new BitCastInst(PtrInst, FA->getType()); + auto *CastInst = new BitCastInst(PtrInst, FA->getType()); CastInst->insertAfter(PtrInst); NewValueForUse = CastInst; } @@ -217,11 +240,85 @@ class LocalAccessorToSharedMemory : public ModulePass { return NF; } - virtual llvm::StringRef getPassName() const { - return "localaccessortosharedmemory"; + void populateCudaKernels(Module &M, SmallVector &Kernels) { + // Access `nvvm.annotations` to determine which functions are kernel entry + // points. + auto *NvvmMetadata = M.getNamedMetadata("nvvm.annotations"); + if (!NvvmMetadata) + return; + + for (auto *MetadataNode : NvvmMetadata->operands()) { + if (MetadataNode->getNumOperands() != 3) + continue; + + // NVPTX identifies kernel entry points using metadata nodes of the form: + // !X = !{, !"kernel", i32 1} + const MDOperand &TypeOperand = MetadataNode->getOperand(1); + auto *Type = dyn_cast(TypeOperand); + if (!Type) + continue; + // Only process kernel entry points. + if (Type->getString() != "kernel") + continue; + + // Get a pointer to the entry point function from the metadata. 
+ const MDOperand &FuncOperand = MetadataNode->getOperand(0); + if (!FuncOperand) + continue; + auto *FuncConstant = dyn_cast(FuncOperand); + if (!FuncConstant) + continue; + auto *Func = dyn_cast(FuncConstant->getValue()); + if (!Func) + continue; + + Kernels.push_back(KernelPayload(Func, MetadataNode)); + } + } + + void populateAMDKernels(Module &M, SmallVector &Kernels) { + for (auto &F : M) { + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) + Kernels.push_back(KernelPayload(&F)); + } } -}; + void populateKernels(Module &M, SmallVector &Kernels, + ArchType AT) { + switch (AT) { + case ArchType::Cuda: + return populateCudaKernels(M, Kernels); + case ArchType::AMDHSA: + return populateAMDKernels(M, Kernels); + default: + llvm_unreachable("Unsupported arch type."); + } + } + + void postProcessCudaKernels( + SmallVector> &NewToOldKernels) { + for (auto &Pair : NewToOldKernels) { + std::get<1>(Pair).MD->replaceOperandWith( + 0, llvm::ConstantAsMetadata::get(std::get<0>(Pair))); + } + } + + void postProcessAMDKernels( + SmallVector> &NewToOldKernels) {} + + void postProcessKernels( + SmallVector> &NewToOldKernels, + ArchType AT) { + switch (AT) { + case ArchType::Cuda: + return postProcessCudaKernels(NewToOldKernels); + case ArchType::AMDHSA: + return postProcessAMDKernels(NewToOldKernels); + default: + llvm_unreachable("Unsupported arch type."); + } + } +}; } // end anonymous namespace char LocalAccessorToSharedMemory::ID = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 958e8c9e5bc54..aa1f800e5446e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -25,6 +25,8 @@ FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone); FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone); void initializeAMDGPURegBankCombinerPass(PassRegistry &); +void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); + // SI Passes FunctionPass *createGCNDPPCombinePass(); FunctionPass 
*createSIAnnotateControlFlowPass(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index a2c61f9da8dae..6258b628efe67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -43,6 +43,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/GlobalDCE.h" @@ -378,6 +379,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsPass(*PR); + + // SYCL-specific passes, needed here to be available to `opt`. + initializeLocalAccessorToSharedMemoryPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -1034,6 +1038,10 @@ void AMDGPUPassConfig::addIRPasses() { // but EarlyCSE can do neither of them. 
if (isPassEnabled(EnableScalarIRPasses)) addEarlyCSEOrGVNPass(); + + if (TM.getTargetTriple().getArch() == Triple::amdgcn && + TM.getTargetTriple().getOS() == Triple::OSType::AMDHSA) + addPass(createLocalAccessorToSharedMemoryPass()); } void AMDGPUPassConfig::addCodeGenPrepare() { diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt index c454b13660e2d..e5a4eb43b912d 100644 --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -37,7 +37,6 @@ set(NVPTXCodeGen_sources NVVMReflect.cpp NVPTXProxyRegErasure.cpp SYCL/GlobalOffset.cpp - SYCL/LocalAccessorToSharedMemory.cpp ) add_llvm_target(NVPTXCodeGen @@ -58,6 +57,7 @@ add_llvm_target(NVPTXCodeGen Target TransformUtils Vectorize + Passes ADD_TO_COMPONENT NVPTX diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 5f4661294f18d..99f37aa5c286b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -18,7 +18,6 @@ #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" #include "SYCL/GlobalOffset.h" -#include "SYCL/LocalAccessorToSharedMemory.h" #include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" @@ -30,6 +29,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -65,6 +65,7 @@ static cl::opt UseShortPointersOpt( namespace llvm { +void initializeLocalAccessorToSharedMemoryPass(PassRegistry &); void initializeNVVMIntrRangePass(PassRegistry&); void initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 
660595a380db9..8ec70ea019ba6 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -50,6 +50,7 @@ ; GCN-O0-NEXT: Expand vector predication intrinsics ; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O0-NEXT: Expand reduction intrinsics +; GCN-O0-NEXT: SYCL Local Accessor to Shared Memory ; GCN-O0-NEXT: AMDGPU Attributor ; GCN-O0-NEXT: CallGraph Construction ; GCN-O0-NEXT: Call Graph SCC Pass Manager @@ -216,6 +217,7 @@ ; GCN-O1-NEXT: Expand vector predication intrinsics ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-NEXT: Expand reduction intrinsics +; GCN-O1-NEXT: SYCL Local Accessor to Shared Memory ; GCN-O1-NEXT: AMDGPU Attributor ; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Call Graph SCC Pass Manager @@ -487,6 +489,7 @@ ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-OPTS-NEXT: Expand reduction intrinsics ; GCN-O1-OPTS-NEXT: Early CSE +; GCN-O1-OPTS-NEXT: SYCL Local Accessor to Shared Memory ; GCN-O1-OPTS-NEXT: AMDGPU Attributor ; GCN-O1-OPTS-NEXT: CallGraph Construction ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager @@ -772,6 +775,7 @@ ; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O2-NEXT: Expand reduction intrinsics ; GCN-O2-NEXT: Early CSE +; GCN-O2-NEXT: SYCL Local Accessor to Shared Memory ; GCN-O2-NEXT: AMDGPU Attributor ; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Call Graph SCC Pass Manager @@ -1072,6 +1076,7 @@ ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Global Value Numbering +; GCN-O3-NEXT: SYCL Local Accessor to Shared Memory ; GCN-O3-NEXT: AMDGPU Attributor ; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Call Graph SCC Pass Manager diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll new file mode 100644 index 0000000000000..41873b5a70bd8 --- /dev/null 
+++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll @@ -0,0 +1,24 @@ +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; ModuleID = 'basic-transformation.bc' +source_filename = "basic-transformation.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that the transformation is applied in the basic case. + +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 + +; Function Attrs: noinline +define amdgpu_kernel void @_ZTS14example_kernel(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) { +; CHECK: define amdgpu_kernel void @_ZTS14example_kernel(i32 %0, i32 addrspace(1)* %b, i32 %c) { +entry: +; CHECK: %1 = getelementptr inbounds [0 x i8], [0 x i8] addrspace(3)* @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 +; CHECK: %a = bitcast i8 addrspace(3)* %1 to i32 addrspace(3)* + %0 = load i32, i32 addrspace(3)* %a +; CHECK: %2 = load i32, i32 addrspace(3)* %a + %1 = load i32, i32 addrspace(1)* %b +; CHECK: %3 = load i32, i32 addrspace(1)* %b + %2 = add i32 %c, %c +; CHECK: %4 = add i32 %c, %c + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll new file mode 100644 index 0000000000000..050491546dd79 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll @@ -0,0 +1,31 @@ +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; ModuleID = 'multiple-functions.bc' +source_filename = "multiple-functions.ll" +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that the transformation does not break kernels which call other functions. + +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 + +define void @_ZTS14other_function(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) { +; CHECK: define void @_ZTS14other_function(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) { + %1 = load i32, i32 addrspace(3)* %a +; CHECK: %1 = load i32, i32 addrspace(3)* %a + %2 = load i32, i32 addrspace(1)* %b +; CHECK: %2 = load i32, i32 addrspace(1)* %b + %3 = add i32 %c, %c +; CHECK: %3 = add i32 %c, %c + ret void +} + +; Function Attrs: noinline +define amdgpu_kernel void @_ZTS14example_kernel(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) { +; CHECK: define amdgpu_kernel void @_ZTS14example_kernel(i32 %0, i32 addrspace(1)* %b, i32 %c) { +entry: +; CHECK: %1 = getelementptr inbounds [0 x i8], [0 x i8] addrspace(3)* @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 +; CHECK: %a = bitcast i8 addrspace(3)* %1 to i32 addrspace(3)* + call void @_ZTS14other_function(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) +; CHECK: call void @_ZTS14other_function(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll new file mode 100644 index 0000000000000..45e1823a6aee5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll @@ -0,0 +1,20 @@ +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; ModuleID = 'no-entry-points.bc' +source_filename = "no-entry-points.ll" +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that no transformation is applied when there are no entry points. + +; Function Attrs: noinline +define void @_ZTS14example_kernel(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) { +; CHECK: define void @_ZTS14example_kernel(i32 addrspace(3)* %a, i32 addrspace(1)* %b, i32 %c) { +entry: + %0 = load i32, i32 addrspace(3)* %a +; CHECK: %0 = load i32, i32 addrspace(3)* %a + %1 = load i32, i32 addrspace(1)* %b +; CHECK: %1 = load i32, i32 addrspace(1)* %b + %2 = add i32 %c, %c +; CHECK: %2 = add i32 %c, %c + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll new file mode 100644 index 0000000000000..3b39347118a64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll @@ -0,0 +1,32 @@ +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s +; ModuleID = 'bitcasts.bc' +source_filename = "bitcasts.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; This test checks that the transformation always bitcasts to the correct type. 
+ +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 + +; Function Attrs: noinline +define amdgpu_kernel void @_ZTS14example_kernel(i32 addrspace(3)* %a, i64 addrspace(3)* %b, i16 addrspace(3)* %c, i8 addrspace(3)* %d) { +; CHECK: define amdgpu_kernel void @_ZTS14example_kernel(i32 %0, i32 %1, i32 %2, i32 %3) { +entry: +; CHECK: %4 = getelementptr inbounds [0 x i8], [0 x i8] addrspace(3)* @_ZTS14example_kernel_shared_mem, i32 0, i32 %3 +; CHECK: %d = bitcast i8 addrspace(3)* %4 to i8 addrspace(3)* +; CHECK: %5 = getelementptr inbounds [0 x i8], [0 x i8] addrspace(3)* @_ZTS14example_kernel_shared_mem, i32 0, i32 %2 +; CHECK: %c = bitcast i8 addrspace(3)* %5 to i16 addrspace(3)* +; CHECK: %6 = getelementptr inbounds [0 x i8], [0 x i8] addrspace(3)* @_ZTS14example_kernel_shared_mem, i32 0, i32 %1 +; CHECK: %b = bitcast i8 addrspace(3)* %6 to i64 addrspace(3)* +; CHECK: %7 = getelementptr inbounds [0 x i8], [0 x i8] addrspace(3)* @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 +; CHECK: %a = bitcast i8 addrspace(3)* %7 to i32 addrspace(3)* + %0 = load i32, i32 addrspace(3)* %a +; CHECK: %8 = load i32, i32 addrspace(3)* %a + %1 = load i64, i64 addrspace(3)* %b +; CHECK: %9 = load i64, i64 addrspace(3)* %b + %2 = load i16, i16 addrspace(3)* %c +; CHECK: %10 = load i16, i16 addrspace(3)* %c + %3 = load i8, i8 addrspace(3)* %d +; CHECK: %11 = load i8, i8 addrspace(3)* %d + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll new file mode 100644 index 0000000000000..05247d67e26aa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll @@ -0,0 +1,25 @@ +; This test checks that the Local Accessor to Shared Memory pass runs with the +; `amdgcn-amd-amdhsa` triple, but not with `amdgcn-amd-amdpal`. 
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-VALID %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-INVALID %s + +; ModuleID = 'local-accessor-to-shared-memory-triple.ll' +source_filename = "local-accessor-to-shared-memory-triple.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; CHECK-VALID: .globl _ZTS14example_kernel +; CHECK-VALID: - .args: +; CHECK-VALID-NOT: .address_space: local +; CHECK-VALID-NEXT: .offset: 0 +; CHECK-VALID-NEXT: .size: 4 + +; CHECK-INVALID: amdpal.pipelines: +; CHECK-INVALID-NOT: - .args: + +; Function Attrs: noinline +define amdgpu_kernel void @_ZTS14example_kernel(i32 addrspace(3)* %a) { +entry: + %0 = load i32, i32 addrspace(3)* %a + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll new file mode 100644 index 0000000000000..e86262ad7f6ce --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll @@ -0,0 +1,32 @@ +; This test checks that the Local Accessor to Shared Memory pass runs with the +; `amdgcn-amd-amdhsa` triple and does not if the option is not present. 
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor=true < %s | FileCheck --check-prefix=CHECK-OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-NO-OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -sycl-enable-local-accessor=false < %s | FileCheck --check-prefix=CHECK-NO-OPT %s + +; ModuleID = 'local-accessor-to-shared-memory-valid-triple.ll' +source_filename = "local-accessor-to-shared-memory-valid-triple.ll" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; CHECK-OPT: .globl _ZTS14example_kernel +; CHECK-OPT: - .args: +; CHECK-OPT-NOT: .address_space: local +; CHECK-OPT-NEXT: .offset: 0 +; CHECK-OPT-NEXT: .size: 4 +; CHECK-OPT-NEXT: .value_kind: by_value +; CHECK-NO-OPT: .globl _ZTS14example_kernel +; CHECK-NO-OPT: - .args: +; CHECK-NO-OPT-NEXT: .address_space: local +; CHECK-NO-OPT-NEXT: .name: a +; CHECK-NO-OPT-NEXT: .offset: 0 +; CHECK-NO-OPT-NEXT: .pointee_align: 4 +; CHECK-NO-OPT-NEXT: .size: 4 +; CHECK-NO-OPT-NEXT: .value_kind: dynamic_shared_pointer +; Function Attrs: noinline +define amdgpu_kernel void @_ZTS14example_kernel(i32 addrspace(3)* %a) { +entry: + %0 = load i32, i32 addrspace(3)* %a + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll index 21e16fcdb3ec1..19359bfe458a9 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s +; RUN: opt 
-enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s ; ModuleID = 'basic-transformation.bc' source_filename = "basic-transformation.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll index 697bbf1f15ee6..72b67d9754a9d 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll @@ -1,6 +1,6 @@ ; This test checks that the Local Accessor to Shared Memory pass does not run with the ; `nvptx64-nvidia-nvcl` triple. -; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck %s +; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor < %s | FileCheck %s ; CHECK: .param .u64 .ptr .shared .align 4 _ZTS14example_kernel_param_0 ; ModuleID = 'local-accessor-to-shared-memory-invalid-triple.ll' diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll index 00484c1ffe81b..fe9055d676da2 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s ; ModuleID = 'multiple-functions.bc' source_filename = "multiple-functions.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll index 62799f05bf134..0f434e3491284 100644 --- 
a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s ; ModuleID = 'no-entry-points.bc' source_filename = "no-entry-points.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll index d38d05636ed24..bcd0a03b5b051 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory %s -S -o - | FileCheck %s +; RUN: opt -enable-new-pm=0 -localaccessortosharedmemory -sycl-enable-local-accessor %s -S -o - | FileCheck %s ; ModuleID = 'bitcasts.bc' source_filename = "bitcasts.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll index f72daa02e5811..edaec0c52d45b 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll @@ -1,7 +1,7 @@ ; This test checks that the Local Accessor to Shared Memory pass runs with the ; `nvptx64-nvidia-cuda` triple. 
-; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck --check-prefix=CHECK-VALID %s -; RUN: llc -mtriple=nvptx64-nvidia-nvcl < %s | FileCheck --check-prefix=CHECK-INVALID %s +; RUN: llc -mtriple=nvptx64-nvidia-cuda -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-VALID %s +; RUN: llc -mtriple=nvptx64-nvidia-nvcl -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-INVALID %s ; CHECK-VALID: .param .u32 _ZTS14example_kernel_param_0 ; CHECK-INVALID: .param .u64 .ptr .shared .align 4 _ZTS14example_kernel_param_0 diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll index 6c13984b61a6d..c49e7c5bce550 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll @@ -1,7 +1,11 @@ ; This test checks that the Local Accessor to Shared Memory pass runs with the ; `nvptx64-nvidia-cuda` triple. 
-; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck %s
-; CHECK: .param .u32 _ZTS14example_kernel_param_0
+; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor < %s | FileCheck --check-prefix=CHECK-OPT %s
+; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor=true < %s | FileCheck --check-prefix=CHECK-OPT %s
+; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck --check-prefix=CHECK-NO-OPT %s
+; RUN: llc -march=nvptx64 -mcpu=sm_20 -sycl-enable-local-accessor=false < %s | FileCheck --check-prefix=CHECK-NO-OPT %s
+; CHECK-OPT: .param .u32 _ZTS14example_kernel_param_0
+; CHECK-NO-OPT-NOT: .param .u32 _ZTS14example_kernel_param_0
 
 ; ModuleID = 'local-accessor-to-shared-memory-valid-triple.ll'
 source_filename = "local-accessor-to-shared-memory-valid-triple.ll"
diff --git a/sycl/plugins/hip/pi_hip.hpp b/sycl/plugins/hip/pi_hip.hpp
index 2c85678f6b6e4..488e8c7d7d05f 100644
--- a/sycl/plugins/hip/pi_hip.hpp
+++ b/sycl/plugins/hip/pi_hip.hpp
@@ -612,7 +612,32 @@ struct _pi_kernel {
 
+  // Adds the local-memory argument at `index`. add_arg is handed a size_t
+  // offset rather than a pointer: the current local size rounded up to the
+  // argument's alignment (its own size, capped at max_alignment).
+  //
+  // `size` is the argument's size in bytes. A size of zero is accepted; the
+  // offset is then passed through without any rounding.
   void add_local_arg(size_t index, size_t size) {
     size_t localOffset = this->get_local_size();
-    add_arg(index, sizeof(size_t), (const void *)&(localOffset), size);
+
+    // maximum required alignment is the size of the largest vector type
+    const size_t max_alignment = sizeof(double) * 16;
+
+    // for arguments smaller than the maximum alignment simply align to the
+    // size of the argument
+    const size_t alignment = std::min(max_alignment, size);
+
+    // align the argument; a zero-sized argument yields alignment == 0 and
+    // must skip this step, since `localOffset % 0` is undefined behaviour
+    size_t alignedLocalOffset = localOffset;
+    if (alignment != 0 && localOffset % alignment != 0) {
+      alignedLocalOffset += alignment - (localOffset % alignment);
+    }
+
+    // the padding inserted in front of the argument is added to the size
+    // handed to add_arg (NOTE(review): presumably so that get_local_size()
+    // accounts for it on the next call -- confirm against add_arg)
+    add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset),
+            size + (alignedLocalOffset - localOffset));
   }
 
   void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) {