diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index e54ec4f2b1d72..d028820f16c91 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1762,6 +1762,26 @@ class OpenMPIRBuilder { EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP); + /// Generate a target-task for the target construct + /// + /// \param OutlinedFn The outlined device/target kernel function. + /// \param OutlinedFnID The outlined function ID. + /// \param EmitTargetCallFallbackCB Call back function to generate host + /// fallback code. + /// \param Args Data structure holding information about the kernel arguments. + /// \param DeviceID Identifier for the device via the 'device' clause. + /// \param RTLoc Source location identifier + /// \param AllocaIP The insertion point to be used for alloca instructions. + /// \param Dependencies Vector of DependData objects holding information of + /// dependencies as specified by the 'depend' clause. + /// \param HasNoWait True if the target construct had 'nowait' on it, false + /// otherwise + InsertPointTy emitTargetTask( + Function *OutlinedFn, Value *OutlinedFnID, + EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, + Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP, + SmallVector &Dependencies, bool HasNoWait); + /// Emit the arguments to be passed to the runtime library based on the /// arrays of base pointers, pointers, sizes, map types, and mappers. If /// ForEndCall, emit map types to be passed for the end of the region instead @@ -2245,6 +2265,8 @@ class OpenMPIRBuilder { /// \param BodyGenCB Callback that will generate the region code. 
/// \param ArgAccessorFuncCB Callback that will generate accessors /// instructions for passed in target arguments where neccessary + /// \param Dependencies A vector of DependData objects that carry + /// dependency information as passed in the depend clause InsertPointTy createTarget(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, @@ -2253,7 +2275,8 @@ class OpenMPIRBuilder { SmallVectorImpl &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, - TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB); + TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, + SmallVector Dependencies = {}); /// Returns __kmpc_for_static_init_* runtime function for the specified /// size \a IVSize and sign \a IVSigned. Will create a distribute call diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index cb4de9c8876dc..9a94c7b4b91fe 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -357,23 +357,23 @@ BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, // This function creates a fake integer value and a fake use for the integer // value. It returns the fake value created. This is useful in modeling the // extra arguments to the outlined functions. 
-Value *createFakeIntVal(IRBuilder<> &Builder, +Value *createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, - std::stack &ToBeDeleted, + llvm::SmallVectorImpl &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name = "", bool AsPtr = true) { Builder.restoreIP(OuterAllocaIP); Instruction *FakeVal; AllocaInst *FakeValAddr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr"); - ToBeDeleted.push(FakeValAddr); + ToBeDeleted.push_back(FakeValAddr); if (AsPtr) { FakeVal = FakeValAddr; } else { FakeVal = Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val"); - ToBeDeleted.push(FakeVal); + ToBeDeleted.push_back(FakeVal); } // Generate a fake use of this value @@ -386,7 +386,7 @@ Value *createFakeIntVal(IRBuilder<> &Builder, UseFakeVal = cast(Builder.CreateAdd(FakeVal, Builder.getInt32(10))); } - ToBeDeleted.push(UseFakeVal); + ToBeDeleted.push_back(UseFakeVal); return FakeVal; } @@ -1698,6 +1698,74 @@ void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) { emitTaskyieldImpl(Loc); } +// Processes the dependencies in Dependencies and does the following +// - Allocates space on the stack of an array of DependInfo objects +// - Populates each DependInfo object with relevant information of +// the corresponding dependence. +// - All code is inserted in the entry block of the current function. +static Value *emitTaskDependencies( + OpenMPIRBuilder &OMPBuilder, + SmallVectorImpl &Dependencies) { + // Early return if we have no dependencies to process + if (Dependencies.empty()) + return nullptr; + + // Given a vector of DependData objects, in this function we create an + // array on the stack that holds kmp_dep_info objects corresponding + // to each dependency. This is then passed to the OpenMP runtime. + // For example, if there are 'n' dependencies then the following pseudo + // code is generated. 
Assume the first dependence is on a variable 'a' + // + // \code{c} + // DepArray = alloc(n x sizeof(kmp_depend_info)); + // idx = 0; + // DepArray[idx].base_addr = ptrtoint(&a); + // DepArray[idx].len = 8; + // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/ + // ++idx; + // DepArray[idx].base_addr = ...; + // \endcode + + IRBuilderBase &Builder = OMPBuilder.Builder; + Type *DependInfo = OMPBuilder.DependInfo; + Module &M = OMPBuilder.M; + + Value *DepArray = nullptr; + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + Builder.SetInsertPoint( + OldIP.getBlock()->getParent()->getEntryBlock().getTerminator()); + + Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size()); + DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr"); + + for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) { + Value *Base = + Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx); + // Store the pointer to the variable + Value *Addr = Builder.CreateStructGEP( + DependInfo, Base, + static_cast(RTLDependInfoFields::BaseAddr)); + Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty()); + Builder.CreateStore(DepValPtr, Addr); + // Store the size of the variable + Value *Size = Builder.CreateStructGEP( + DependInfo, Base, static_cast(RTLDependInfoFields::Len)); + Builder.CreateStore( + Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)), + Size); + // Store the dependency kind + Value *Flags = Builder.CreateStructGEP( + DependInfo, Base, + static_cast(RTLDependInfoFields::Flags)); + Builder.CreateStore( + ConstantInt::get(Builder.getInt8Ty(), + static_cast(Dep.DepKind)), + Flags); + } + Builder.restoreIP(OldIP); + return DepArray; +} + OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, @@ -1742,7 +1810,7 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, OI.ExitBB = TaskExitBB; 
// Add the thread ID argument. - std::stack ToBeDeleted; + SmallVector ToBeDeleted; OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); @@ -1939,10 +2007,8 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; }); } - while (!ToBeDeleted.empty()) { - ToBeDeleted.top()->eraseFromParent(); - ToBeDeleted.pop(); - } + llvm::for_each(llvm::reverse(ToBeDeleted), + [](Instruction *I) { I->eraseFromParent(); }); }; addOutlineInfo(std::move(OI)); @@ -5212,6 +5278,91 @@ static Function *createOutlinedFunction( return Func; } +/// Create an entry point for a target task with the following. +/// It'll have the following signature +/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) +/// This function is called from emitTargetTask once the +/// code to launch the target kernel has been outlined already. +static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, + IRBuilderBase &Builder, + CallInst *StaleCI) { + Module &M = OMPBuilder.M; + // KernelLaunchFunction is the target launch function, i.e. + // the function that sets up kernel arguments and calls + // __tgt_target_kernel to launch the kernel on the device. + // + Function *KernelLaunchFunction = StaleCI->getCalledFunction(); + + // StaleCI is the CallInst which is the call to the outlined + // target kernel launch function. If there are values that the + // outlined function uses then these are aggregated into a structure + // which is passed as the second argument. If not, then there's + // only one argument, the threadID. 
So, StaleCI can be + // + // %structArg = alloca { ptr, ptr }, align 8 + // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0 + // store ptr %20, ptr %gep_, align 8 + // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1 + // store ptr %21, ptr %gep_8, align 8 + // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg) + // + // OR + // + // call void @_QQmain..omp_par.1(i32 %global.tid.val6) + OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(), + StaleCI->getIterator()); + LLVMContext &Ctx = StaleCI->getParent()->getContext(); + Type *ThreadIDTy = Type::getInt32Ty(Ctx); + Type *TaskPtrTy = OMPBuilder.TaskPtr; + Type *TaskTy = OMPBuilder.Task; + auto ProxyFnTy = + FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy}, + /* isVarArg */ false); + auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage, + ".omp_target_task_proxy_func", + Builder.GetInsertBlock()->getModule()); + ProxyFn->getArg(0)->setName("thread.id"); + ProxyFn->getArg(1)->setName("task"); + + BasicBlock *EntryBB = + BasicBlock::Create(Builder.getContext(), "entry", ProxyFn); + Builder.SetInsertPoint(EntryBB); + + bool HasShareds = StaleCI->arg_size() > 1; + // TODO: This is a temporary assert to prove to ourselves that + // the outlined target launch function is always going to have + // at most two arguments if there is any data shared between + // host and device. 
+ assert((!HasShareds || (StaleCI->arg_size() == 2)) && + "StaleCI with shareds should have exactly two arguments."); + if (HasShareds) { + auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); + assert(ArgStructAlloca && + "Unable to find the alloca instruction corresponding to arguments " + "for extracted function"); + auto *ArgStructType = + dyn_cast(ArgStructAlloca->getAllocatedType()); + + AllocaInst *NewArgStructAlloca = + Builder.CreateAlloca(ArgStructType, nullptr, "structArg"); + Value *TaskT = ProxyFn->getArg(1); + Value *ThreadId = ProxyFn->getArg(0); + Value *SharedsSize = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); + + Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0); + LoadInst *LoadShared = + Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds); + + Builder.CreateMemCpy( + NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared, + LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize); + + Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca}); + } + Builder.CreateRetVoid(); + return ProxyFn; +} static void emitTargetOutlinedFunction( OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, @@ -5229,13 +5380,281 @@ static void emitTargetOutlinedFunction( OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true, OutlinedFn, OutlinedFnID); } +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask( + Function *OutlinedFn, Value *OutlinedFnID, + EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, + Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, + SmallVector &Dependencies, + bool HasNoWait) { + + // When we arrive at this function, the target region itself has been + // outlined into the function OutlinedFn. + // So at this point, for + // -------------------------------------------------- + // void user_code_that_offloads(...) 
{ + // omp target depend(..) map(from:a) map(to:b, c) + // a = b + c + // } + // + // -------------------------------------------------- + // + // we have + // + // -------------------------------------------------- + // + // void user_code_that_offloads(...) { + // %.offload_baseptrs = alloca [3 x ptr], align 8 + // %.offload_ptrs = alloca [3 x ptr], align 8 + // %.offload_mappers = alloca [3 x ptr], align 8 + // ;; target region has been outlined and now we need to + // ;; offload to it via a target task. + // } + // void outlined_device_function(ptr a, ptr b, ptr c) { + // *a = *b + *c + // } + // + // We have to now do the following + // (i) Make an offloading call to outlined_device_function using the OpenMP + // RTL. See 'kernel_launch_function' in the pseudo code below. This is + // emitted by emitKernelLaunch + // (ii) Create a task entry point function that calls kernel_launch_function + // and is the entry point for the target task. See + // '@.omp_target_task_proxy_func' in the pseudocode below. + // (iii) Create a task with the task entry point created in (ii) + // + // That is we create the following + // + // void user_code_that_offloads(...) { + // %.offload_baseptrs = alloca [3 x ptr], align 8 + // %.offload_ptrs = alloca [3 x ptr], align 8 + // %.offload_mappers = alloca [3 x ptr], align 8 + // + // %structArg = alloca { ptr, ptr, ptr }, align 8 + // %structArg[0] = %.offload_baseptrs + // %structArg[1] = %.offload_ptrs + // %structArg[2] = %.offload_mappers + // proxy_target_task = @__kmpc_omp_task_alloc(..., + // @.omp_target_task_proxy_func) + // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg)) + // dependencies_array = ... + // ;; if nowait not present + // call @__kmpc_omp_wait_deps(..., dependencies_array) + // call @__kmpc_omp_task_begin_if0(...) + // call @.omp_target_task_proxy_func(thread_id, %proxy_target_task) + // call @__kmpc_omp_task_complete_if0(...) 
+ // } + // + // define internal void @.omp_target_task_proxy_func(i32 %thread.id, + // ptr %task) { + // %structArg = alloca {ptr, ptr, ptr} + // %shared_data = load (getelementptr %task, 0, 0) + // memcpy(%structArg, %shared_data, sizeof(structArg)) + // kernel_launch_function(%thread.id, %structArg) + // } + // + // We need the proxy function because the signature of the task entry point + // expected by kmpc_omp_task is always the same and will be different from + // that of the kernel_launch function. + // + // kernel_launch_function is generated by emitKernelLaunch and has the + // always_inline attribute. + // void kernel_launch_function(thread_id, + // structArg) alwaysinline { + // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 + // offload_baseptrs = load(getelementptr structArg, 0, 0) + // offload_ptrs = load(getelementptr structArg, 0, 1) + // offload_mappers = load(getelementptr structArg, 0, 2) + // ; setup kernel_args using offload_baseptrs, offload_ptrs and + // ; offload_mappers + // call i32 @__tgt_target_kernel(..., + // outlined_device_function, + // ptr %kernel_args) + // } + // void outlined_device_function(ptr a, ptr b, ptr c) { + // *a = *b + *c + // } + // + BasicBlock *TargetTaskBodyBB = + splitBB(Builder, /*CreateBranch=*/true, "target.task.body"); + BasicBlock *TargetTaskAllocaBB = + splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca"); + + InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB, + TargetTaskAllocaBB->begin()); + InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin()); + + OutlineInfo OI; + OI.EntryBB = TargetTaskAllocaBB; + OI.OuterAllocaBB = AllocaIP.getBlock(); + + // Add the thread ID argument. + SmallVector ToBeDeleted; + OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); + + Builder.restoreIP(TargetTaskBodyIP); + + // emitKernelLaunch makes the necessary runtime call to offload the kernel. 
+ // We then outline all that code into a separate function + // ('kernel_launch_function' in the pseudo code above). This function is then + // called by the target task proxy function (see + // '@.omp_target_task_proxy_func' in the pseudo code above) + // '@.omp_target_task_proxy_func' is generated by emitTargetTaskProxyFunction + Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID, + EmitTargetCallFallbackCB, Args, DeviceID, + RTLoc, TargetTaskAllocaIP)); + + OI.ExitBB = Builder.saveIP().getBlock(); + OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, + HasNoWait](Function &OutlinedFn) mutable { + assert(OutlinedFn.getNumUses() == 1 && + "there must be a single user for the outlined function"); + + CallInst *StaleCI = cast(OutlinedFn.user_back()); + bool HasShareds = StaleCI->arg_size() > 1; + + Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI); + + LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn + << "\n"); + + Builder.SetInsertPoint(StaleCI); + + // Gather the arguments for emitting the runtime call. + uint32_t SrcLocStrSize; + Constant *SrcLocStr = + getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + // @__kmpc_omp_task_alloc + Function *TaskAllocFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc); + + // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) + // call. + Value *ThreadID = getOrCreateThreadID(Ident); -static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, - OpenMPIRBuilder::InsertPointTy AllocaIP, - Function *OutlinedFn, Constant *OutlinedFnID, - int32_t NumTeams, int32_t NumThreads, - SmallVectorImpl &Args, - OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) { + // Argument - `sizeof_kmp_task_t` (TaskSize) + // Tasksize refers to the size in bytes of kmp_task_t data structure + // including private vars accessed in task. 
+ // TODO: add kmp_task_t_with_privates (privates) + Value *TaskSize = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task)); + + // Argument - `sizeof_shareds` (SharedsSize) + // SharedsSize refers to the shareds array size in the kmp_task_t data + // structure. + Value *SharedsSize = Builder.getInt64(0); + if (HasShareds) { + auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); + assert(ArgStructAlloca && + "Unable to find the alloca instruction corresponding to arguments " + "for extracted function"); + auto *ArgStructType = + dyn_cast(ArgStructAlloca->getAllocatedType()); + assert(ArgStructType && "Unable to find struct type corresponding to " + "arguments for extracted function"); + SharedsSize = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); + } + + // Argument - `flags` + // Task is tied iff (Flags & 1) == 1. + // Task is untied iff (Flags & 1) == 0. + // Task is final iff (Flags & 2) == 2. + // Task is not final iff (Flags & 2) == 0. + // A target task is not final and is untied. 
+ Value *Flags = Builder.getInt32(0); + + // Emit the @__kmpc_omp_task_alloc runtime call + // The runtime call returns a pointer to an area where the task captured + // variables must be copied before the task is run (TaskData) + CallInst *TaskData = Builder.CreateCall( + TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags, + /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize, + /*task_func=*/ProxyFn}); + + if (HasShareds) { + Value *Shareds = StaleCI->getArgOperand(1); + Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); + Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); + Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, + SharedsSize); + } + + Value *DepArray = emitTaskDependencies(*this, Dependencies); + + // --------------------------------------------------------------- + // V5.2 13.8 target construct + // If the nowait clause is present, execution of the target task + // may be deferred. If the nowait clause is not present, the target task is + // an included task. + // --------------------------------------------------------------- + // The above means that the lack of a nowait on the target construct + // translates to '#pragma omp task if(0)' + if (!HasNoWait) { + if (DepArray) { + Function *TaskWaitFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps); + Builder.CreateCall( + TaskWaitFn, + {/*loc_ref=*/Ident, /*gtid=*/ThreadID, + /*ndeps=*/Builder.getInt32(Dependencies.size()), + /*dep_list=*/DepArray, + /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0), + /*noalias_dep_list=*/ + ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); + } + // Included task. NOTE(review): ProxyFn takes (i32, ptr); verify 1-arg call. 
+ Function *TaskBeginFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0); + Function *TaskCompleteFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0); + Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData}); + CallInst *CI = nullptr; + if (HasShareds) + CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData}); + else + CI = Builder.CreateCall(ProxyFn, {ThreadID}); + CI->setDebugLoc(StaleCI->getDebugLoc()); + Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData}); + } else if (DepArray) { + // HasNoWait - meaning the task may be deferred. Call + // __kmpc_omp_task_with_deps if there are dependencies, + // else call __kmpc_omp_task + Function *TaskFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps); + Builder.CreateCall( + TaskFn, + {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()), + DepArray, ConstantInt::get(Builder.getInt32Ty(), 0), + ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); + } else { + // Emit the @__kmpc_omp_task runtime call to spawn the task + Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task); + Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData}); + } + + StaleCI->eraseFromParent(); + llvm::for_each(llvm::reverse(ToBeDeleted), + [](Instruction *I) { I->eraseFromParent(); }); + }; + addOutlineInfo(std::move(OI)); + + LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n" + << *(Builder.GetInsertBlock()) << "\n"); + LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n" + << *(Builder.GetInsertBlock()->getParent()->getParent()) + << "\n"); + return Builder.saveIP(); +} +static void emitTargetCall( + OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, + OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, + Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads, + SmallVectorImpl &Args, + OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, + SmallVector Dependencies = {}) { 
OpenMPIRBuilder::TargetDataInfo Info( /*RequiresDevicePointerInfo=*/false, @@ -5272,23 +5691,34 @@ static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Value *DynCGGroupMem = Builder.getInt32(0); bool HasNoWait = false; + bool HasDependencies = Dependencies.size() > 0; + bool RequiresOuterTargetTask = HasNoWait || HasDependencies; OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations, NumTeamsVal, NumThreadsVal, DynCGGroupMem, HasNoWait); - Builder.restoreIP(OMPBuilder.emitKernelLaunch( - Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, - DeviceID, RTLoc, AllocaIP)); + // The presence of certain clauses on the target directive require the + // explicit generation of the target task. + if (RequiresOuterTargetTask) { + Builder.restoreIP(OMPBuilder.emitTargetTask( + OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID, + RTLoc, AllocaIP, Dependencies, HasNoWait)); + } else { + Builder.restoreIP(OMPBuilder.emitKernelLaunch( + Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, + DeviceID, RTLoc, AllocaIP)); + } } - OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl &Args, GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, - OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) { + OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, + SmallVector Dependencies) { + if (!updateToLocation(Loc)) return InsertPointTy(); @@ -5296,12 +5726,18 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget( Function *OutlinedFn; Constant *OutlinedFnID; + // The target region is outlined into its own function. 
The LLVM IR for + // the target region itself is generated using the callbacks CBFunc + // and ArgAccessorFuncCB emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn, OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB); + + // If we are not on the target device, then we need to generate code + // to make a remote call (offload) to the previously outlined function + // that represents the target region. Do that now. if (!Config.isTargetDevice()) emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams, - NumThreads, Args, GenMapInfoCB); - + NumThreads, Args, GenMapInfoCB, Dependencies); return Builder.saveIP(); } @@ -6422,7 +6858,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, OI.OuterAllocaBB = &OuterAllocaBB; // Insert fake values for global tid and bound tid. - std::stack ToBeDeleted; + SmallVector ToBeDeleted; InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin()); OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true)); @@ -6437,7 +6873,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, assert(OutlinedFn.getNumUses() == 1 && "there must be a single user for the outlined function"); CallInst *StaleCI = cast(OutlinedFn.user_back()); - ToBeDeleted.push(StaleCI); + ToBeDeleted.push_back(StaleCI); assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) && "Outlined function must have two or three arguments only"); @@ -6461,10 +6897,9 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, omp::RuntimeFunction::OMPRTL___kmpc_fork_teams), Args); - while (!ToBeDeleted.empty()) { - ToBeDeleted.top()->eraseFromParent(); - ToBeDeleted.pop(); - } + llvm::for_each(llvm::reverse(ToBeDeleted), + [](Instruction *I) { I->eraseFromParent(); }); + }; if (!Config.isTargetDevice()) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp 
index 6ec4c120c11ea..391bbacc2f6cd 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -682,6 +682,32 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, return bodyGenStatus; } +static void +buildDependData(std::optional depends, OperandRange dependVars, + LLVM::ModuleTranslation &moduleTranslation, + SmallVectorImpl &dds) { + if (dependVars.empty()) + return; + for (auto dep : llvm::zip(dependVars, depends->getValue())) { + llvm::omp::RTLDependenceKindTy type; + switch ( + cast(std::get<1>(dep)).getValue()) { + case mlir::omp::ClauseTaskDepend::taskdependin: + type = llvm::omp::RTLDependenceKindTy::DepIn; + break; + // The OpenMP runtime requires that the codegen for 'depend' clause for + // 'out' dependency kind must be the same as codegen for 'depend' clause + // with 'inout' dependency. + case mlir::omp::ClauseTaskDepend::taskdependout: + case mlir::omp::ClauseTaskDepend::taskdependinout: + type = llvm::omp::RTLDependenceKindTy::DepInOut; + break; + }; + llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep)); + llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal); + dds.emplace_back(dd); + } +} /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder. 
static LogicalResult convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, @@ -705,28 +731,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, }; SmallVector dds; - if (!taskOp.getDependVars().empty() && taskOp.getDepends()) { - for (auto dep : - llvm::zip(taskOp.getDependVars(), taskOp.getDepends()->getValue())) { - llvm::omp::RTLDependenceKindTy type; - switch ( - cast(std::get<1>(dep)).getValue()) { - case mlir::omp::ClauseTaskDepend::taskdependin: - type = llvm::omp::RTLDependenceKindTy::DepIn; - break; - // The OpenMP runtime requires that the codegen for 'depend' clause for - // 'out' dependency kind must be the same as codegen for 'depend' clause - // with 'inout' dependency. - case mlir::omp::ClauseTaskDepend::taskdependout: - case mlir::omp::ClauseTaskDepend::taskdependinout: - type = llvm::omp::RTLDependenceKindTy::DepInOut; - break; - }; - llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep)); - llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal); - dds.emplace_back(dd); - } - } + buildDependData(taskOp.getDepends(), taskOp.getDependVars(), + moduleTranslation, dds); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation); @@ -3088,10 +3094,14 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, if (!mapData.IsDeclareTarget[i] && !mapData.IsAMember[i]) kernelInput.push_back(mapData.OriginalValue[i]); } + SmallVector dds; + buildDependData(targetOp.getDepends(), targetOp.getDependVars(), + moduleTranslation, dds); builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget( ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams, - defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB)); + defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB, + dds)); // Remap access operations to declare target reference pointers for the // device, essentially generating extra loadop's as necessary diff 
--git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir new file mode 100644 index 0000000000000..c386342005e5e --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir @@ -0,0 +1,140 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + llvm.func @_QQmain() attributes {fir.bindc_name = "main"} { + %0 = llvm.mlir.constant(39 : index) : i64 + %1 = llvm.mlir.constant(0 : index) : i64 + %2 = llvm.mlir.constant(1 : index) : i64 + %3 = llvm.mlir.constant(40 : index) : i64 + %4 = llvm.mlir.addressof @_QFEa : !llvm.ptr + %5 = llvm.mlir.addressof @_QFEb : !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr + %8 = llvm.mlir.addressof @_QFEn : !llvm.ptr + omp.task { + %14 = llvm.mlir.constant(1 : i64) : i64 + %15 = llvm.alloca %14 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + %16 = llvm.load %8 : !llvm.ptr -> i32 + %17 = llvm.sext %16 : i32 to i64 + %18 = llvm.trunc %2 : i64 to i32 + llvm.br ^bb1(%18, %17 : i32, i64) + ^bb1(%19: i32, %20: i64): // 2 preds: ^bb0, ^bb2 + %21 = llvm.icmp "sgt" %20, %1 : i64 + llvm.cond_br %21, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + llvm.store %19, %15 : i32, !llvm.ptr + %22 = llvm.load %15 : !llvm.ptr -> i32 + %23 = llvm.sext %22 : i32 to i64 + %24 = llvm.mlir.constant(1 : i64) : i64 + %25 = llvm.mlir.constant(0 : i64) : i64 + %26 = llvm.sub %23, %24 overflow : i64 + %27 = llvm.mul %26, %24 overflow : i64 + %28 = llvm.mul %27, %24 overflow : i64 + %29 = llvm.add %28, %25 overflow : i64 + %30 = llvm.mul %24, %3 overflow : i64 + %31 = llvm.getelementptr %4[%29] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + llvm.store %22, %31 : i32, !llvm.ptr + %32 = llvm.load %15 : !llvm.ptr -> i32 + %33 = llvm.add %32, %18 : i32 + %34 = llvm.sub %20, %2 : i64 + llvm.br ^bb1(%33, %34 : i32, i64) + ^bb3: // pred: ^bb1 + llvm.store %19, %15 : i32, !llvm.ptr + omp.terminator + } + %9 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%0 
: i64) extent(%3 : i64) stride(%2 : i64) start_idx(%2 : i64) + %10 = omp.map.info var_ptr(%4 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(to) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "a"} + %11 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.array<40 x i32>) map_clauses(from) capture(ByRef) bounds(%9) -> !llvm.ptr {name = "b"} + %12 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} + %13 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"} + omp.target map_entries(%10 -> %arg0, %11 -> %arg1, %12 -> %arg2, %13 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) depend(taskdependin -> %4 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: !llvm.ptr): + %14 = llvm.mlir.constant(0 : index) : i64 + %15 = llvm.mlir.constant(10 : i32) : i32 + %16 = llvm.mlir.constant(1 : index) : i64 + %17 = llvm.mlir.constant(40 : index) : i64 + %18 = llvm.load %arg3 : !llvm.ptr -> i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.trunc %16 : i64 to i32 + llvm.br ^bb1(%20, %19 : i32, i64) + ^bb1(%21: i32, %22: i64): // 2 preds: ^bb0, ^bb2 + %23 = llvm.icmp "sgt" %22, %14 : i64 + llvm.cond_br %23, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + llvm.store %21, %arg2 : i32, !llvm.ptr + %24 = llvm.load %arg2 : !llvm.ptr -> i32 + %25 = llvm.sext %24 : i32 to i64 + %26 = llvm.mlir.constant(1 : i64) : i64 + %27 = llvm.mlir.constant(0 : i64) : i64 + %28 = llvm.sub %25, %26 overflow : i64 + %29 = llvm.mul %28, %26 overflow : i64 + %30 = llvm.mul %29, %26 overflow : i64 + %31 = llvm.add %30, %27 overflow : i64 + %32 = llvm.mul %26, %17 overflow : i64 + %33 = llvm.getelementptr %arg0[%31] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + %34 = llvm.load %33 : !llvm.ptr -> i32 + %35 = llvm.add %34, %15 : i32 + %36 = llvm.mlir.constant(1 : i64) : i64 + %37 = llvm.mlir.constant(0 : i64) : i64 + %38 = llvm.sub %25, 
%36 overflow : i64 + %39 = llvm.mul %38, %36 overflow : i64 + %40 = llvm.mul %39, %36 overflow : i64 + %41 = llvm.add %40, %37 overflow : i64 + %42 = llvm.mul %36, %17 overflow : i64 + %43 = llvm.getelementptr %arg1[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + llvm.store %35, %43 : i32, !llvm.ptr + %44 = llvm.load %arg2 : !llvm.ptr -> i32 + %45 = llvm.add %44, %20 : i32 + %46 = llvm.sub %22, %16 : i64 + llvm.br ^bb1(%45, %46 : i32, i64) + ^bb3: // pred: ^bb1 + llvm.store %21, %arg2 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @_QFEa() {addr_space = 0 : i32} : !llvm.array<40 x i32> { + %0 = llvm.mlir.zero : !llvm.array<40 x i32> + llvm.return %0 : !llvm.array<40 x i32> + } + llvm.mlir.global internal @_QFEb() {addr_space = 0 : i32} : !llvm.array<40 x i32> { + %0 = llvm.mlir.zero : !llvm.array<40 x i32> + llvm.return %0 : !llvm.array<40 x i32> + } + llvm.mlir.global internal @_QFEc() {addr_space = 0 : i32} : !llvm.array<40 x i32> { + %0 = llvm.mlir.zero : !llvm.array<40 x i32> + llvm.return %0 : !llvm.array<40 x i32> + } + llvm.mlir.global internal @_QFEn() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(40 : i32) : i32 + llvm.return %0 : i32 + } + llvm.func @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} + llvm.func @_FortranAProgramEndStatement() attributes {sym_visibility = "private"} + llvm.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.mlir.zero : !llvm.ptr + llvm.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %1) {fastmathFlags = #llvm.fastmath} : (i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + llvm.call @_QQmain() {fastmathFlags = #llvm.fastmath} : () -> () + llvm.call @_FortranAProgramEndStatement() {fastmathFlags = #llvm.fastmath} : () -> () + llvm.return %0 : i32 + } + +// %structArg holds pointers to shared data.
+// CHECK: define void @_QQmain() { +// CHECK-DAG: %[[STRUCTARG:.+]] = alloca { ptr, ptr, ptr }, align 8 +// CHECK-DAG: %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8 +// CHECK: %[[DEP_INFO:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0 +// CHECK: %[[PTR0:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 0 +// CHECK: store i64 ptrtoint (ptr @_QFEa to i64), ptr %[[PTR0]], align 4 +// CHECK: %[[PTR1:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 1 +// CHECK: store i64 8, ptr %[[PTR1]], align 4 +// CHECK: %[[PTR2:.+]] = getelementptr inbounds %struct.kmp_dep_info, ptr %[[DEP_INFO]], i32 0, i32 2 +// CHECK: store i8 1, ptr %[[PTR2]], align 1 + +// CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func) +// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false) +// CHECK: call void @__kmpc_omp_wait_deps({{.+}}, i32 1, ptr %[[DEP_ARRAY]], i32 0, ptr null) +// CHECK: call void @__kmpc_omp_task_begin_if0({{.+}}, ptr %[[TASKDATA]]) +// CHECK: call void @.omp_target_task_proxy_func({{.+}}, ptr %[[TASKDATA]]) +// CHECK: call void @__kmpc_omp_task_complete_if0({{.+}}, ptr %[[TASKDATA]]) + diff --git a/offload/test/offloading/fortran/target-depend.f90 b/offload/test/offloading/fortran/target-depend.f90 new file mode 100644 index 0000000000000..928eb671c9706 --- /dev/null +++ b/offload/test/offloading/fortran/target-depend.f90 @@ -0,0 +1,69 @@ +! Offloading test checking the use of the depend clause on +! the target construct +! REQUIRES: flang, amdgcn-amd-amdhsa +! UNSUPPORTED: nvptx64-nvidia-cuda +! UNSUPPORTED: nvptx64-nvidia-cuda-LTO +! UNSUPPORTED: aarch64-unknown-linux-gnu +! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +! UNSUPPORTED: x86_64-pc-linux-gnu +! 
UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+  implicit none
+  integer :: a = 0
+  INTERFACE
+    FUNCTION omp_get_device_num() BIND(C)
+      USE, INTRINSIC :: iso_c_binding, ONLY: C_INT
+      integer(C_INT) :: omp_get_device_num
+    END FUNCTION omp_get_device_num
+  END INTERFACE
+
+  call foo(5, a)
+  print*, "======= FORTRAN Test passed! ======="
+  print*, "foo(5) returned ", a, ", expected 6\n"
+
+  ! stop 0
+  contains
+    subroutine foo(N, r)
+      integer, intent(in) :: N
+      integer, intent(out) :: r
+      integer :: z, i, accumulator
+      z = 1
+      accumulator = 0
+      ! Spawn 3 threads
+      !$omp parallel num_threads(3)
+
+      ! A single thread will then create two tasks - one is the 'producer' and
+      ! potentially slower task that updates 'z' to 'N'. The second is an
+      ! offloaded target task that increments 'z'. If the depend clauses work
+      ! properly, the target task should wait for the 'producer' task to
+      ! complete before incrementing 'z'. We use 'omp single' here because the
+      ! depend clause establishes dependencies between sibling tasks only.
+      ! This is the easiest way of creating two sibling tasks.
+      !$omp single
+      !$omp task depend(out: z) shared(z)
+      do i=1, 32766
+        ! Busy loop whose only purpose is to slow down the update of 'z'.
+        ! Adding a function call slows down the producer to the point
+        ! that removing the depend clause from the target construct below
+        ! frequently results in the wrong answer.
+        accumulator = accumulator + omp_get_device_num()
+      end do
+      z = N
+      !$omp end task
+
+      ! z is 5 now. Increment z to 6.
+      !$omp target map(tofrom: z) depend(in:z)
+      z = z + 1
+      !$omp end target
+      !$omp end single
+      !$omp end parallel
+      ! Use 'accumulator' so it is not optimized away by the compiler.
+      print *, accumulator
+      r = z
+    end subroutine foo
+
+!CHECK: ======= FORTRAN Test passed! =======
+!CHECK: foo(5) returned 6 , expected 6
+end program main