From 29ce6d6ba933d5da541c6760137ed0889df88790 Mon Sep 17 00:00:00 2001
From: Artur Gainullin
Date: Wed, 1 Apr 2020 22:31:17 -0700
Subject: [PATCH] [SYCL] Share PFWG lambda object through shared memory

In the current implementation, the private address of the PFWG lambda
object is shared by the leader work item with the other work items
through local memory. This is not correct. Instead, copy the PFWG lambda
object itself to shared memory and make the work items use the address
of the object in shared memory, i.e. handle this case the same way as
byval parameters.

Signed-off-by: Artur Gainullin
---
 llvm/lib/SYCLLowerIR/LowerWGScope.cpp  | 107 ++++++++++++++++---------
 llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll |  42 +++++-----
 2 files changed, 94 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
index 197d01447b9a4..b3b41b0c48863 100644
--- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -375,20 +375,29 @@ using LocalsSet = SmallPtrSet;
 static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
                                         IRBuilder<> &Builder, bool Loc2Shadow) {
   Type *T = nullptr;
-  int LocAlignN = 0;
+  MaybeAlign LocAlign(0);
 
   if (const auto *AI = dyn_cast<AllocaInst>(L)) {
     T = AI->getAllocatedType();
-    LocAlignN = AI->getAlignment();
+    LocAlign = MaybeAlign(AI->getAlignment());
   } else {
-    T = cast<Argument>(L)->getParamByValType();
-    LocAlignN = cast<Argument>(L)->getParamAlignment();
+    if (cast<Argument>(L)->hasByValAttr()) {
+      T = cast<Argument>(L)->getParamByValType();
+      LocAlign = MaybeAlign(cast<Argument>(L)->getParamAlignment());
+    } else {
+      Type *Ty = cast<Argument>(L)->getType();
+      Module &M = *Shadow->getParent();
+      LocAlign = M.getDataLayout().getValueOrABITypeAlignment(
+          MaybeAlign(cast<Argument>(L)->getParamAlignment()), Ty);
+      auto PtrTy = dyn_cast<PointerType>(cast<Argument>(L)->getType());
+      assert(PtrTy && "Expected pointer type");
+      T = PtrTy->getElementType();
+    }
   }
 
   if (T->isAggregateType()) {
     // TODO: we should use methods which directly return MaybeAlign once such
     // are added to LLVM for AllocaInst and GlobalVariable
-    auto LocAlign = MaybeAlign(LocAlignN);
     auto ShdAlign = MaybeAlign(Shadow->getAlignment());
     Module &M = *Shadow->getParent();
     auto SizeVal = M.getDataLayout().getTypeStoreSize(T);
@@ -679,10 +688,25 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
 // Go through "byval" parameters which are passed as AS(0) pointers
 // and: (1) create local shadows for them (2) and initialize them from the
 // leader's copy and (3) replace usages with pointer to the shadow
-static void shareByValParams(Function &F, const Triple &TT) {
-  // split
+//
+// Do the same for the 'this' pointer, which points to the PFWG lambda object
+// allocated in the caller. The caller is a kernel function generated by the
+// SYCL frontend; it allocates the PFWG lambda object, initializes the captured
+// objects (like accessors) from the kernel arguments, and then calls the PFWG
+// function (the operator() of the PFWG object). The PFWG object captures all
+// objects by value, and all uses of these values (except the initialization
+// from kernel arguments) can only be in the scope of the PFWG function, which
+// is why no copy back of the PFWG object is needed.
+static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
+  // Skip the alloca instructions and split after them. Alloca instructions
+  // must be at the beginning of the function, otherwise they are considered
+  // dynamic, which can cause problems with inlining.
   BasicBlock *EntryBB = &F.getEntryBlock();
-  BasicBlock *LeaderBB = EntryBB->splitBasicBlock(&EntryBB->front(), "leader");
+  Instruction *SplitPoint = &*EntryBB->begin();
+  for (; SplitPoint->getOpcode() == Instruction::Alloca;
+       SplitPoint = SplitPoint->getNextNode())
+    ;
+  BasicBlock *LeaderBB = EntryBB->splitBasicBlock(SplitPoint, "leader");
   BasicBlock *MergeBB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "merge");
 
   // 1) rewire the above basic blocks so that LeaderBB is executed only for the
@@ -692,38 +716,48 @@ static void shareByValParams(Function &F, const Triple &TT) {
   Instruction &At = LeaderBB->back();
 
   for (auto &Arg : F.args()) {
-    if (!Arg.hasByValAttr())
-      continue;
-    assert(Arg.getType()->getPointerAddressSpace() ==
-           asUInt(spirv::AddrSpace::Private));
-    Type *T = Arg.getParamByValType();
-
-    // 2) create the shared copy - "shadow" - for current byval arg
-    GlobalVariable *Shadow =
-        spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
+    Type *T;
+    LLVMContext &Ctx = At.getContext();
+    IRBuilder<> Builder(Ctx);
+    Builder.SetInsertPoint(&LeaderBB->front());
 
-    // 3) replace argument with shadow in all uses
-    Value *RepVal = Shadow;
-    if (TT.isNVPTX()) {
-      // For NVPTX target address space inference for kernel arguments and
-      // allocas is happening in the backend (NVPTXLowerArgs and
-      // NVPTXLowerAlloca passes). After the frontend these pointers are in LLVM
-      // default address space 0 which is the generic address space for NVPTX
-      // target.
-      assert(Arg.getType()->getPointerAddressSpace() == 0);
-
-      // Cast a pointer in the shared address space to the generic address
-      // space.
+    // 2) create the shared copy - "shadow" - for current arg
+    GlobalVariable *Shadow;
+    Value *RepVal;
+    if (Arg.hasByValAttr()) {
+      assert(Arg.getType()->getPointerAddressSpace() ==
+             asUInt(spirv::AddrSpace::Private));
+      T = Arg.getParamByValType();
+      Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
+      RepVal = Shadow;
+      if (TT.isNVPTX()) {
+        // For the NVPTX target, address space inference for kernel arguments
+        // and allocas happens in the backend (NVPTXLowerArgs and
+        // NVPTXLowerAlloca passes). After the frontend these pointers are in
+        // the LLVM default address space 0, which is the generic address
+        // space for the NVPTX target.
+        assert(Arg.getType()->getPointerAddressSpace() == 0);
+
+        // Cast a pointer in the shared address space to the generic address
+        // space.
+        RepVal = ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow,
+                                                                Arg.getType());
+      }
+    }
+    // Process 'this' pointer which points to PFWG lambda object
+    else if (Arg.getArgNo() == 0) {
+      PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
+      assert(PtrT && "Expected this pointer as the first argument");
+      T = PtrT->getElementType();
+      Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
       RepVal =
-          ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
+          Builder.CreatePointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
     }
+
+    // 3) replace argument with shadow in all uses
     for (auto *U : Arg.users())
       U->replaceUsesOfWith(&Arg, RepVal);
 
-    // 4) fill the shadow from the argument for the leader WI only
-    LLVMContext &Ctx = At.getContext();
-    IRBuilder<> Builder(Ctx);
-    Builder.SetInsertPoint(&LeaderBB->front());
     copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
                                 true /*private->shadow*/);
   }
@@ -832,8 +866,9 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F, const llvm::Triple &TT,
   for (auto *PFWICall : PFWICalls)
     fixupPrivateMemoryPFWILambdaCaptures(PFWICall);
 
-  // Finally, create shadows for and replace usages of byval pointer params
-  shareByValParams(F, TT);
+  // Finally, create shadows for and replace usages of byval pointer params and
+  // PFWG lambda object ('this' pointer).
+  sharePFWGPrivateObjects(F, TT);
 
 #ifndef NDEBUG
   if (HaveChanges && Debug > 0)
diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
index b73087847dcb4..09ba788316dee 100644
--- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
+++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
@@ -13,51 +13,55 @@
 %struct.foo = type { %struct.barney }
 %struct.foo.0 = type { i8 }
 
-; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)*
+; CHECK: @[[GROUP_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.zot addrspace(4)*
+; CHECK: @[[PFWG_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)*
 ; CHECK: @[[PFWI_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.foo.0
+; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar
 ; CHECK: @[[GROUP_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.zot
 
 define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.zot* byval(%struct.zot) align 8 %arg1) align 2 !work_group_scope !0 {
 ; CHECK-LABEL: @wibble(
 ; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
 ; CHECK-NEXT:    [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0
 ; CHECK-NEXT:    br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
 ; CHECK:       leader:
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.zot* [[ARG1:%.*]] to i8*
 ; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 96, i1 false)
+; CHECK-NEXT:    [[ARG_CAST:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i8 addrspace(4)* align 8 [[ARG_CAST]], i64 1, i1 false)
 ; CHECK-NEXT:    br label [[MERGE]]
 ; CHECK:       merge:
-; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
-; CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
-; CHECK-NEXT:    [[ID:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
-; CHECK-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[ID]], 0
+; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
+; CHECK-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
 ; CHECK-NEXT:    br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
 ; CHECK:       wg_leader:
-; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
+; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* addrspacecast (%struct.bar addrspace(3)* @[[PFWG_SHADOW]] to [[STRUCT_BAR]] addrspace(4)*), [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast [[STRUCT_ZOT:%.*]] addrspace(3)* @[[GROUP_SHADOW]] to [[STRUCT_ZOT]] addrspace(4)*
-; CHECK-NEXT:    store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4
+; CHECK-NEXT:    store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
 ; CHECK-NEXT:    br label [[WG_CF]]
 ; CHECK:       wg_cf:
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
-; CHECK-NEXT:    [[CMPZ2:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
+; CHECK-NEXT:    [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0
 ; CHECK-NEXT:    br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
 ; CHECK:       TestMat:
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
-; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP4]], i64 1, i1 false)
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP5]], i64 1, i1 false)
 ; CHECK-NEXT:    [[MAT_LD:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]]
-; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]]
+; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
 ; CHECK-NEXT:    br label [[LEADERMAT]]
 ; CHECK:       LeaderMat:
-; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
-; CHECK-NEXT:    [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]]
+; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
+; CHECK-NEXT:    [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
 ; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD1]], [[STRUCT_BAR]] addrspace(4)** [[TMP]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP5]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
-; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
-; CHECK-NEXT:    [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP6]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
+; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
+; CHECK-NEXT:    [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
 ; CHECK-NEXT:    call spir_func void @bar(%struct.zot addrspace(4)* [[WG_VAL_TMP4]], %struct.foo.0* byval(%struct.foo.0) align 1 [[TMP2]])
 ; CHECK-NEXT:    ret void
 ;
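
Note: the construct this patch affects is SYCL hierarchical parallelism. For context, below is a minimal sketch of such a kernel (illustrative only, not part of the patch; all names and sizes are invented). The lambda passed to parallel_for_work_group is the "PFWG lambda object": the frontend-generated kernel allocates it in private memory, initializes its by-value captures (here, the accessor) from the kernel arguments, and calls its operator(). With this patch the object itself is copied into local memory, so every work item dereferences a valid shared address instead of the leader's private one.

#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  queue Q;
  int Data[8] = {0};
  {
    buffer<int, 1> Buf(Data, range<1>(8));
    Q.submit([&](handler &CGH) {
      auto Acc = Buf.get_access<access::mode::read_write>(CGH);
      // The outer lambda is the PFWG lambda object; it captures Acc by value.
      // All uses of the capture stay inside this function, which is why the
      // pass never needs to copy the object back out of local memory.
      CGH.parallel_for_work_group<class WG>(
          range<1>(2), range<1>(4), [=](group<1> G) {
            G.parallel_for_work_item([&](h_item<1> I) {
              Acc[I.get_global().get_linear_id()] += 1;
            });
          });
    });
  }
  return 0;
}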