From 99549f4fa15d1b3c3903fd8f220aa0b7128b1f34 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 28 Jul 2022 16:21:42 -0700 Subject: [PATCH 1/6] [SYCL] Enable discard_events mode for the Level Zero --- sycl/include/sycl/detail/pi.h | 2 + sycl/plugins/level_zero/pi_level_zero.cpp | 302 ++++++++++++---------- sycl/plugins/level_zero/pi_level_zero.hpp | 7 + sycl/source/detail/queue_impl.hpp | 21 +- 4 files changed, 192 insertions(+), 140 deletions(-) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 67094702273d6..6e159a2eba59f 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -564,6 +564,8 @@ constexpr pi_queue_properties PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE = (1 << 0); constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = (1 << 1); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = (1 << 2); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = (1 << 3); +constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE = + (1 << 4); using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 4c528da29b7a6..f405a2538d25c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -674,7 +674,8 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, // the host-visible pool inline static pi_result createEventAndAssociateQueue( pi_queue Queue, pi_event *Event, pi_command_type CommandType, - pi_command_list_ptr_t CommandList, bool ForceHostVisible = false) { + pi_command_list_ptr_t CommandList, bool IsInternal = false, + bool ForceHostVisible = false) { PI_CALL(EventCreate(Queue->Context, Queue, ForceHostVisible ? true : EventsScope == AllHostVisible, @@ -682,6 +683,7 @@ inline static pi_result createEventAndAssociateQueue( (*Event)->Queue = Queue; (*Event)->CommandType = CommandType; + (*Event)->Internal = IsInternal; // Append this Event to the CommandList, if any if (CommandList != Queue->CommandListMap.end()) { @@ -701,8 +703,11 @@ inline static pi_result createEventAndAssociateQueue( // But we have to ensure that the event is not destroyed before // it is really signalled, so retain it explicitly here and // release in CleanupCompletedEvent(Event). - // - PI_CALL(piEventRetain(*Event)); + // If the event is internal then don't increment the reference count as this + // event will not be waited/released by SYCL RT, so it must be destroyed by + // EventRelease in resetCommandList. + if (!IsInternal) + PI_CALL(piEventRetain(*Event)); return PI_SUCCESS; } @@ -938,6 +943,11 @@ bool _pi_queue::isInOrderQueue() const { return ((this->Properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0); } +bool _pi_queue::isDiscardEvents() const { + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE) != + 0); +} + pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, bool MakeAvailable, @@ -1556,45 +1566,56 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // afterwards. if (EventsScope == LastCommandInBatchHostVisible && !CommandList->second.EventList.empty()) { - // Create a "proxy" host-visible event. - // - pi_event HostVisibleEvent; - auto Res = createEventAndAssociateQueue( - this, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, true); - if (Res) - return Res; + // If there are only internal events in the command list then we don't + // need to create host proxy event. + auto Result = std::find_if(CommandList->second.EventList.begin(), + CommandList->second.EventList.end(), + [](pi_event E) { return !E->Internal; }); + if (Result != CommandList->second.EventList.end()) { + // Create a "proxy" host-visible event. + // + pi_event HostVisibleEvent; + auto Res = createEventAndAssociateQueue( + this, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* ForceHostVisible */ true); + if (Res) + return Res; + + // Update each command's event in the command-list to "see" this + // proxy event as a host-visible counterpart. + for (auto &Event : CommandList->second.EventList) { + std::scoped_lock EventLock(Event->Mutex); + // Internal event doesn't need host-visible proxy. + if (Event->Internal) + continue; - // Update each command's event in the command-list to "see" this - // proxy event as a host-visible counterpart. - for (auto &Event : CommandList->second.EventList) { - std::scoped_lock EventLock(Event->Mutex); - if (!Event->HostVisibleEvent) { - Event->HostVisibleEvent = HostVisibleEvent; - PI_CALL(piEventRetain(HostVisibleEvent)); + if (!Event->HostVisibleEvent) { + Event->HostVisibleEvent = HostVisibleEvent; + PI_CALL(piEventRetain(HostVisibleEvent)); + } } - } - // Decrement the reference count of the event such that all the remaining - // references are from the other commands in this batch and from the - // command-list itself. This host-visible event will not be - // waited/released by SYCL RT, so it must be destroyed after all events in - // the batch are gone. - // We know that refcount is more than 2 because we check that EventList of - // the command list is not empty above, i.e. after - // createEventAndAssociateQueue ref count is 2 and then +1 for each event - // in the EventList. - PI_CALL(piEventRelease(HostVisibleEvent)); - PI_CALL(piEventRelease(HostVisibleEvent)); - - // Indicate no cleanup is needed for this PI event as it is special. - HostVisibleEvent->CleanedUp = true; - - // Finally set to signal the host-visible event at the end of the - // command-list. - // TODO: see if we need a barrier here (or explicit wait for all events in - // the batch). - ZE_CALL(zeCommandListAppendSignalEvent, - (CommandList->first, HostVisibleEvent->ZeEvent)); + // Decrement the reference count of the event such that all the + // remaining references are from the other commands in this batch and + // from the command-list itself. This host-visible event will not be + // waited/released by SYCL RT, so it must be destroyed after all events + // in the batch are gone. We know that refcount is more than 2 because + // we check that EventList of the command list is not empty above, i.e. + // after createEventAndAssociateQueue ref count is 2 and then +1 for + // each event in the EventList. + PI_CALL(piEventRelease(HostVisibleEvent)); + PI_CALL(piEventRelease(HostVisibleEvent)); + + // Indicate no cleanup is needed for this PI event as it is special. + HostVisibleEvent->CleanedUp = true; + + // Finally set to signal the host-visible event at the end of the + // command-list. + // TODO: see if we need a barrier here (or explicit wait for all events + // in the batch). + ZE_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, HostVisibleEvent->ZeEvent)); + } } // Close the command list and have it ready for dispatch. @@ -3404,7 +3425,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, // Check that unexpected bits are not set. PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | - PI_QUEUE_ON_DEVICE_DEFAULT)), + PI_QUEUE_ON_DEVICE_DEFAULT | + PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE)), PI_ERROR_INVALID_VALUE); PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); @@ -5212,10 +5234,9 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { + const pi_event *EventWaitList, pi_event *OutEvent) { PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); // Lock automatically releases when this goes out of scope. @@ -5322,8 +5343,11 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, return Res; ze_event_handle_t ZeEvent = nullptr; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; pi_result Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList); + Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; ZeEvent = (*Event)->ZeEvent; @@ -5441,8 +5465,8 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) { return Res; // Create a "proxy" host-visible event. - auto Res = createEventAndAssociateQueue( - Queue, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, true); + auto Res = createEventAndAssociateQueue(Queue, &HostVisibleEvent, + PI_COMMAND_TYPE_USER, CommandList); // HostVisibleEvent->CleanedUp = true; if (Res != PI_SUCCESS) return Res; @@ -5466,6 +5490,7 @@ pi_result _pi_event::reset() { CommandData = nullptr; CommandType = PI_COMMAND_TYPE_USER; WaitList = {}; + Internal = false; RefCount.reset(); if (!isHostVisible()) @@ -5806,6 +5831,9 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { // thus proxy events can be waited without a deadlock. // for (uint32_t I = 0; I < NumEvents; I++) { + if (EventList[I]->Internal) + die("piEventsWait must not be called for an internal event"); + ze_event_handle_t ZeHostVisibleEvent; if (auto Res = EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) @@ -5827,6 +5855,9 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { for (uint32_t I = 0; I < NumEvents; I++) { { std::shared_lock EventLock(EventList[I]->Mutex); + if (EventList[I]->Internal) + die("piEventsWait must not be called for an internal event"); + if (!EventList[I]->Completed) { auto HostVisibleEvent = EventList[I]->HostVisibleEvent; if (!HostVisibleEvent) @@ -6145,10 +6176,10 @@ pi_result piSamplerRelease(pi_sampler Sampler) { // Queue Commands // pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { + const pi_event *EventWaitList, + pi_event *OutEvent) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); if (EventWaitList) { PI_ASSERT(NumEventsInWaitList > 0, PI_ERROR_INVALID_VALUE); @@ -6170,10 +6201,14 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, return Res; ze_event_handle_t ZeEvent = nullptr; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList); + CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; + ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -6198,17 +6233,21 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - Queue->CommandListMap.end()); - if (Res != PI_SUCCESS) - return Res; + if (OutEvent) { + auto Res = createEventAndAssociateQueue( + Queue, OutEvent, PI_COMMAND_TYPE_USER, Queue->CommandListMap.end()); + if (Res != PI_SUCCESS) + return Res; + } Queue->synchronize(); - Queue->LastCommandEvent = *Event; + if (OutEvent) { + Queue->LastCommandEvent = *OutEvent; - ZE_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); - (*Event)->Completed = true; + ZE_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent)); + (*OutEvent)->Completed = true; + } } resetCommandLists(Queue); @@ -6219,9 +6258,8 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, - pi_event *Event) { + pi_event *OutEvent) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); @@ -6229,9 +6267,10 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, // Helper function for appending a barrier to a command list. auto insertBarrierIntoCmdList = [&Queue](pi_command_list_ptr_t CmdList, - const _pi_ze_event_list_t &EventWaitList, pi_event &Event) { + const _pi_ze_event_list_t &EventWaitList, pi_event &Event, + bool IsInternal) { if (auto Res = createEventAndAssociateQueue( - Queue, &Event, PI_COMMAND_TYPE_USER, CmdList)) + Queue, &Event, PI_COMMAND_TYPE_USER, CmdList, IsInternal)) return Res; Event->WaitList = EventWaitList; ZE_CALL(zeCommandListAppendBarrier, @@ -6240,6 +6279,10 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, return PI_SUCCESS; }; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + // Indicator for whether batching is allowed. This may be changed later in // this function, but allow it by default. bool OkToBatch = true; @@ -6265,8 +6308,10 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, return Res; // Insert the barrier into the command-list and execute. - if (auto Res = insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event)) + if (auto Res = + insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)) return Res; + if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch)) return Res; @@ -6332,7 +6377,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, std::vector EventWaitVector(CmdLists.size()); for (size_t I = 0; I < CmdLists.size(); ++I) if (auto Res = insertBarrierIntoCmdList(CmdLists[I], _pi_ze_event_list_t{}, - EventWaitVector[I])) + EventWaitVector[I], false)) return Res; if (CmdLists.size() > 1) { @@ -6355,8 +6400,8 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, // Insert a barrier with the events from each command-queue into the // convergence command list. The resulting event signals the convergence of // all barriers. - if (auto Res = - insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, *Event)) + if (auto Res = insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, + *Event, IsInternal)) return Res; } else { // If there is only a single queue we have inserted all the barriers we need @@ -6479,15 +6524,13 @@ pi_result _pi_queue::synchronize() { // Shared by all memory read/write/copy PI interfaces. // PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. -static pi_result enqueueMemCopyHelper(pi_command_type CommandType, - pi_queue Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, - const void *Src, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, - pi_event *Event, bool PreferCopyEngine) { +static pi_result +enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, + pi_bool BlockingWrite, size_t Size, const void *Src, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *OutEvent, + bool PreferCopyEngine) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); @@ -6506,8 +6549,11 @@ static pi_result enqueueMemCopyHelper(pi_command_type CommandType, return Res; ze_event_handle_t ZeEvent = nullptr; - auto Res = - createEventAndAssociateQueue(Queue, Event, CommandType, CommandList); + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; ZeEvent = (*Event)->ZeEvent; @@ -6545,10 +6591,9 @@ static pi_result enqueueMemCopyRectHelper( pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event, bool PreferCopyEngine) { + const pi_event *EventWaitList, pi_event *OutEvent, bool PreferCopyEngine) { PI_ASSERT(Region && SrcOrigin && DstOrigin && Queue, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); @@ -6567,8 +6612,11 @@ static pi_result enqueueMemCopyRectHelper( return Res; ze_event_handle_t ZeEvent = nullptr; - auto Res = - createEventAndAssociateQueue(Queue, Event, CommandType, CommandList); + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; ZeEvent = (*Event)->ZeEvent; @@ -6761,9 +6809,8 @@ static pi_result enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, const void *Pattern, size_t PatternSize, size_t Size, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { + const pi_event *EventWaitList, pi_event *OutEvent) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); // Pattern size must be a power of two. PI_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), PI_ERROR_INVALID_VALUE); @@ -6813,10 +6860,14 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, return Res; ze_event_handle_t ZeEvent = nullptr; - auto Res = - createEventAndAssociateQueue(Queue, Event, CommandType, CommandList); + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; + ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -6873,19 +6924,21 @@ static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context, pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event, - void **RetMap) { + const pi_event *EventWaitList, + pi_event *OutEvent, void **RetMap) { // TODO: we don't implement read-only or write-only, always read-write. // assert((map_flags & PI_MAP_READ) != 0); // assert((map_flags & PI_MAP_WRITE) != 0); PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); auto Buffer = pi_cast(Mem); + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; ze_event_handle_t ZeEvent = nullptr; bool UseCopyEngine = false; @@ -6898,11 +6951,12 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) return Res; - auto Res = createEventAndAssociateQueue(Queue, Event, - PI_COMMAND_TYPE_MEM_BUFFER_MAP, - Queue->CommandListMap.end()); + auto Res = createEventAndAssociateQueue( + Queue, Event, PI_COMMAND_TYPE_MEM_BUFFER_MAP, + Queue->CommandListMap.end(), IsInternal); if (Res != PI_SUCCESS) return Res; + ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; } @@ -6936,21 +6990,11 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, // For integrated devices the buffer has been allocated in host memory. if (Buffer->OnHost) { // Wait on incoming events before doing the copy - PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); + if (NumEventsInWaitList > 0) + PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); - if (Queue->isInOrderQueue()) { - pi_event TmpLastCommandEvent = nullptr; - - { - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - TmpLastCommandEvent = Queue->LastCommandEvent; - } - - if (TmpLastCommandEvent != nullptr) { - PI_CALL(piEventsWait(1, &TmpLastCommandEvent)); - } - } + if (Queue->isInOrderQueue()) + PI_CALL(piQueueFinish(Queue)); // Lock automatically releases when this goes out of scope. std::scoped_lock Guard(Buffer->Mutex); @@ -7037,10 +7081,9 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { + const pi_event *EventWaitList, pi_event *OutEvent) { PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); auto Buffer = pi_cast(Mem); @@ -7048,6 +7091,9 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, bool UseCopyEngine = false; ze_event_handle_t ZeEvent = nullptr; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; { // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); @@ -7057,9 +7103,9 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) return Res; - auto Res = createEventAndAssociateQueue(Queue, Event, - PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - Queue->CommandListMap.end()); + auto Res = createEventAndAssociateQueue( + Queue, Event, PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, + Queue->CommandListMap.end(), IsInternal); if (Res != PI_SUCCESS) return Res; ZeEvent = (*Event)->ZeEvent; @@ -7093,21 +7139,11 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, // For integrated devices the buffer is allocated in host memory. if (Buffer->OnHost) { // Wait on incoming events before doing the copy - PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); + if (NumEventsInWaitList > 0) + PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); - if (Queue->isInOrderQueue()) { - pi_event TmpLastCommandEvent = nullptr; - - { - // Lock automatically releases when this goes out of scope. - std::shared_lock lock(Queue->Mutex); - TmpLastCommandEvent = Queue->LastCommandEvent; - } - - if (TmpLastCommandEvent != nullptr) { - PI_CALL(piEventsWait(1, &TmpLastCommandEvent)); - } - } + if (Queue->isInOrderQueue()) + PI_CALL(piQueueFinish(Queue)); char *ZeHandleDst; PI_CALL( @@ -7225,9 +7261,8 @@ static pi_result enqueueMemImageCommandHelper( pi_bool IsBlocking, pi_image_offset SrcOrigin, pi_image_offset DstOrigin, pi_image_region Region, size_t RowPitch, size_t SlicePitch, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, - pi_event *Event, bool PreferCopyEngine = false) { + pi_event *OutEvent, bool PreferCopyEngine = false) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); @@ -7246,8 +7281,11 @@ static pi_result enqueueMemImageCommandHelper( return Res; ze_event_handle_t ZeEvent = nullptr; - auto Res = - createEventAndAssociateQueue(Queue, Event, CommandType, CommandList); + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; + auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, + CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; ZeEvent = (*Event)->ZeEvent; @@ -8208,12 +8246,11 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, pi_usm_migration_flags Flags, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, - pi_event *Event) { + pi_event *OutEvent) { // flags is currently unused so fail if set PI_ASSERT(Flags == 0, PI_ERROR_INVALID_VALUE); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); @@ -8241,8 +8278,11 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList); + CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; ZeEvent = (*Event)->ZeEvent; @@ -8277,9 +8317,8 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, /// pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, size_t Length, pi_mem_advice Advice, - pi_event *Event) { + pi_event *OutEvent) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); @@ -8304,8 +8343,11 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; + pi_event InternalEvent; + bool IsInternal = OutEvent == nullptr; + pi_event *Event = OutEvent ? OutEvent : &InternalEvent; auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList); + CommandList, IsInternal); if (Res != PI_SUCCESS) return Res; ZeEvent = (*Event)->ZeEvent; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index d6a08ba03e6f8..41202e379e861 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -965,6 +965,9 @@ struct _pi_queue : _pi_object { // Returns true if the queue is a in-order queue. bool isInOrderQueue() const; + // Returns true if the queue has discard events property. + bool isDiscardEvents() const; + // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. // For copy commands, IsCopy is set to 'true'. @@ -1374,6 +1377,10 @@ struct _pi_event : _pi_object { // being visible to the host at all. bool Completed = {false}; + // Indicates that event is internal, i.e. it is visible inside the L0 plugin + // only. + bool Internal = {false}; + // Reset _pi_event object. pi_result reset(); }; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 4117bdc0b7d8f..f2f4cd292c7b0 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -96,11 +96,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport( - MDiscardEvents && - (MHostQueue ? true - : (MIsInorder && getPlugin().getBackend() != - backend::ext_oneapi_level_zero))) { + MHasDiscardEventsSupport(MDiscardEvents && + (MHostQueue ? true : MIsInorder)) { if (has_property() && has_property()) { throw sycl::exception(make_error_code(errc::invalid), @@ -144,11 +141,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport( - MDiscardEvents && - (MHostQueue ? true - : (MIsInorder && getPlugin().getBackend() != - backend::ext_oneapi_level_zero))) { + MHasDiscardEventsSupport(MDiscardEvents && + (MHostQueue ? true : MIsInorder)) { if (has_property() && has_property()) { throw sycl::exception(make_error_code(errc::invalid), @@ -317,6 +311,13 @@ class queue_impl { ext::oneapi::cuda::property::queue::use_default_stream>()) { CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM; } + if (getPlugin().getBackend() == backend::ext_oneapi_level_zero && + MPropList + .has_property()) { + // Pass this flag to the Level Zero plugin to be able to check it from + // queue property. + CreationFlags |= PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE; + } RT::PiQueue Queue{}; RT::PiContext Context = MContext->getHandleRef(); RT::PiDevice Device = MDevice->getHandleRef(); From 8e09c76ca2b367f15ae80a8e225f35cc3826e880 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 5 Aug 2022 15:17:25 -0700 Subject: [PATCH 2/6] Fix accidental mistake --- sycl/plugins/level_zero/pi_level_zero.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index f405a2538d25c..d96382a1291bf 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -670,7 +670,8 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, // \param Event a pointer to hold the newly created pi_event // \param CommandType various command type determined by the caller // \param CommandList is the command list where the event is added -// \param ForceHostVisible tells if the event must be created in +// \param IsInternal tells if the event is internal, i.e. visible in the L0 +// plugin only. \param ForceHostVisible tells if the event must be created in // the host-visible pool inline static pi_result createEventAndAssociateQueue( pi_queue Queue, pi_event *Event, pi_command_type CommandType, @@ -5465,9 +5466,9 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) { return Res; // Create a "proxy" host-visible event. - auto Res = createEventAndAssociateQueue(Queue, &HostVisibleEvent, - PI_COMMAND_TYPE_USER, CommandList); - // HostVisibleEvent->CleanedUp = true; + auto Res = createEventAndAssociateQueue( + Queue, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* ForceHostVisible */ true); if (Res != PI_SUCCESS) return Res; From 432ccef3bd2e552d5885c9ad677f1937e0750587 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 5 Aug 2022 15:20:20 -0700 Subject: [PATCH 3/6] Fix formatting --- sycl/plugins/level_zero/pi_level_zero.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index d96382a1291bf..346c38b7c013d 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -671,7 +671,8 @@ ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *ZeName, // \param CommandType various command type determined by the caller // \param CommandList is the command list where the event is added // \param IsInternal tells if the event is internal, i.e. visible in the L0 -// plugin only. \param ForceHostVisible tells if the event must be created in +// plugin only. +// \param ForceHostVisible tells if the event must be created in // the host-visible pool inline static pi_result createEventAndAssociateQueue( pi_queue Queue, pi_event *Event, pi_command_type CommandType, From c30be94f53b2e69dd63443a5ae7533576361ba0b Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Tue, 9 Aug 2022 17:01:19 -0700 Subject: [PATCH 4/6] Update PI version --- sycl/include/sycl/detail/pi.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 6e159a2eba59f..9343314f8f7d9 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -49,9 +49,11 @@ // NOTE that this results in a changed API for `piProgramGetBuildInfo`. // 10.12 Change enum value PI_MEM_ADVICE_UNKNOWN from 0 to 999, and set enum // PI_MEM_ADVISE_RESET to 0. +// 10.13 Added new PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE queue +// property. #define _PI_H_VERSION_MAJOR 10 -#define _PI_H_VERSION_MINOR 12 +#define _PI_H_VERSION_MINOR 13 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) From a0d9e50a8c7ba02f717aba191c35d726e4b5b719 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 18 Aug 2022 12:52:25 -0700 Subject: [PATCH 5/6] Address review comments * Use RefCountExternal to keep track of external references * Rename PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE -> PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS * Pass property to all backends --- sycl/include/sycl/detail/pi.h | 6 +- sycl/plugins/level_zero/pi_level_zero.cpp | 74 +++++++++++++---------- sycl/plugins/level_zero/pi_level_zero.hpp | 20 +++++- sycl/plugins/opencl/pi_opencl.cpp | 14 ++++- sycl/source/detail/queue_impl.hpp | 5 +- 5 files changed, 77 insertions(+), 42 deletions(-) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 9343314f8f7d9..0d292bb51038b 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -49,8 +49,7 @@ // NOTE that this results in a changed API for `piProgramGetBuildInfo`. // 10.12 Change enum value PI_MEM_ADVICE_UNKNOWN from 0 to 999, and set enum // PI_MEM_ADVISE_RESET to 0. -// 10.13 Added new PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE queue -// property. +// 10.13 Added new PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS queue property. #define _PI_H_VERSION_MAJOR 10 #define _PI_H_VERSION_MINOR 13 @@ -566,8 +565,7 @@ constexpr pi_queue_properties PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE = (1 << 0); constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = (1 << 1); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = (1 << 2); constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = (1 << 3); -constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE = - (1 << 4); +constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS = (1 << 4); using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 346c38b7c013d..8f8888280cb76 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -31,6 +31,7 @@ extern "C" { // Forward declarartions. static pi_result piQueueReleaseInternal(pi_queue Queue); +static pi_result piEventReleaseInternal(pi_event Event); static pi_result EventCreate(pi_context Context, pi_queue Queue, bool HostVisible, pi_event *RetEvent); } @@ -685,12 +686,11 @@ inline static pi_result createEventAndAssociateQueue( (*Event)->Queue = Queue; (*Event)->CommandType = CommandType; - (*Event)->Internal = IsInternal; // Append this Event to the CommandList, if any if (CommandList != Queue->CommandListMap.end()) { CommandList->second.append(*Event); - PI_CALL(piEventRetain(*Event)); + (*Event)->RefCount.increment(); } // We need to increment the reference counter here to avoid pi_queue @@ -946,8 +946,7 @@ bool _pi_queue::isInOrderQueue() const { } bool _pi_queue::isDiscardEvents() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE) != - 0); + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS) != 0); } pi_result @@ -1202,7 +1201,7 @@ CleanupEventListFromResetCmdList(std::vector &EventListToCleanup, PI_CALL(CleanupCompletedEvent(Event, QueueLocked)); // This event was removed from the command list, so decrement ref count // (it was incremented when they were added to the command list). - PI_CALL(piEventRelease(Event)); + PI_CALL(piEventReleaseInternal(Event)); } return PI_SUCCESS; } @@ -1570,9 +1569,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, !CommandList->second.EventList.empty()) { // If there are only internal events in the command list then we don't // need to create host proxy event. - auto Result = std::find_if(CommandList->second.EventList.begin(), - CommandList->second.EventList.end(), - [](pi_event E) { return !E->Internal; }); + auto Result = + std::find_if(CommandList->second.EventList.begin(), + CommandList->second.EventList.end(), + [](pi_event E) { return E->hasExternalRefs(); }); if (Result != CommandList->second.EventList.end()) { // Create a "proxy" host-visible event. // @@ -1588,12 +1588,12 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, for (auto &Event : CommandList->second.EventList) { std::scoped_lock EventLock(Event->Mutex); // Internal event doesn't need host-visible proxy. - if (Event->Internal) + if (!Event->hasExternalRefs()) continue; if (!Event->HostVisibleEvent) { Event->HostVisibleEvent = HostVisibleEvent; - PI_CALL(piEventRetain(HostVisibleEvent)); + HostVisibleEvent->RefCount.increment(); } } @@ -1605,8 +1605,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // we check that EventList of the command list is not empty above, i.e. // after createEventAndAssociateQueue ref count is 2 and then +1 for // each event in the EventList. - PI_CALL(piEventRelease(HostVisibleEvent)); - PI_CALL(piEventRelease(HostVisibleEvent)); + PI_CALL(piEventReleaseInternal(HostVisibleEvent)); + PI_CALL(piEventReleaseInternal(HostVisibleEvent)); // Indicate no cleanup is needed for this PI event as it is special. HostVisibleEvent->CleanedUp = true; @@ -1825,7 +1825,7 @@ pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, // We can now release all the active barriers and replace them with the ones // in the wait list. for (pi_event &BarrierEvent : ActiveBarriers) - PI_CALL(piEventRelease(BarrierEvent)); + PI_CALL(piEventReleaseInternal(BarrierEvent)); ActiveBarriers.clear(); ActiveBarriers.insert( ActiveBarriers.end(), ActiveBarriersWaitList.PiEventList, @@ -1985,7 +1985,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( } for (pi_uint32 I = 0; I < this->Length; I++) { - PI_CALL(piEventRetain(this->PiEventList[I])); + this->PiEventList[I]->RefCount.increment(); } return PI_SUCCESS; @@ -3428,7 +3428,7 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | PI_QUEUE_ON_DEVICE_DEFAULT | - PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE)), + PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS)), PI_ERROR_INVALID_VALUE); PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); @@ -3606,7 +3606,7 @@ pi_result piQueueRelease(pi_queue Queue) { PI_CALL(CleanupCompletedEvent(Event)); // This event was removed from the command list, so decrement ref count // (it was incremented when they were added to the command list). - PI_CALL(piEventRelease(Event)); + PI_CALL(piEventReleaseInternal(Event)); } PI_CALL(piQueueReleaseInternal(Queue)); return PI_SUCCESS; @@ -5492,7 +5492,7 @@ pi_result _pi_event::reset() { CommandData = nullptr; CommandType = PI_COMMAND_TYPE_USER; WaitList = {}; - Internal = false; + RefCountExternal = 0; RefCount.reset(); if (!isHostVisible()) @@ -5586,7 +5586,10 @@ static pi_result EventCreate(pi_context Context, pi_queue Queue, // Exteral PI API entry pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { - return EventCreate(Context, nullptr, EventsScope == AllHostVisible, RetEvent); + pi_result Result = + EventCreate(Context, nullptr, EventsScope == AllHostVisible, RetEvent); + (*RetEvent)->RefCountExternal++; + return Result; } pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, @@ -5782,7 +5785,7 @@ static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) { // association with queue. Events which don't have associated queue doesn't // require this release because it means that they are not created using // createEventAndAssociateQueue, i.e. additional retain was not made. - PI_CALL(piEventRelease(Event)); + PI_CALL(piEventReleaseInternal(Event)); } // The list of dependent events will be appended to as we walk it so that this @@ -5814,7 +5817,7 @@ static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) { } if (DepEventKernel) PI_CALL(piKernelRelease(pi_cast(DepEvent->CommandData))); - PI_CALL(piEventRelease(DepEvent)); + PI_CALL(piEventReleaseInternal(DepEvent)); } return PI_SUCCESS; @@ -5833,7 +5836,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { // thus proxy events can be waited without a deadlock. // for (uint32_t I = 0; I < NumEvents; I++) { - if (EventList[I]->Internal) + if (!EventList[I]->hasExternalRefs()) die("piEventsWait must not be called for an internal event"); ze_event_handle_t ZeHostVisibleEvent; @@ -5857,7 +5860,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { for (uint32_t I = 0; I < NumEvents; I++) { { std::shared_lock EventLock(EventList[I]->Mutex); - if (EventList[I]->Internal) + if (!EventList[I]->hasExternalRefs()) die("piEventsWait must not be called for an internal event"); if (!EventList[I]->Completed) { @@ -5906,12 +5909,21 @@ pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) { } pi_result piEventRetain(pi_event Event) { + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + Event->RefCountExternal++; Event->RefCount.increment(); return PI_SUCCESS; } pi_result piEventRelease(pi_event Event) { PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + Event->RefCountExternal--; + PI_CALL(piEventReleaseInternal(Event)); + return PI_SUCCESS; +} + +static pi_result piEventReleaseInternal(pi_event Event) { + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); if (!Event->RefCount.decrementAndTest()) return PI_SUCCESS; @@ -5936,7 +5948,7 @@ pi_result piEventRelease(pi_event Event) { // and release a reference to it. if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) { // Decrement ref-count of the host-visible proxy event. - PI_CALL(piEventRelease(Event->HostVisibleEvent)); + PI_CALL(piEventReleaseInternal(Event->HostVisibleEvent)); } // We intentionally incremented the reference counter when an event is @@ -6319,7 +6331,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, if (UseMultipleCmdlistBarriers) { // Retain and save the resulting event for future commands. - PI_CALL(piEventRetain(*Event)); + (*Event)->RefCount.increment(); Queue->ActiveBarriers.push_back(*Event); } return PI_SUCCESS; @@ -6331,7 +6343,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, // Doing it early prevents potential additional barriers from implicitly being // appended. for (pi_event &E : Queue->ActiveBarriers) - PI_CALL(piEventRelease(E)); + PI_CALL(piEventReleaseInternal(E)); Queue->ActiveBarriers.clear(); // Get command lists for each command queue. @@ -6397,7 +6409,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, ConvergenceCmdList->second.isCopy(Queue))) return Res; for (pi_event &E : EventWaitVector) - PI_CALL(piEventRelease(E)); + PI_CALL(piEventReleaseInternal(E)); // Insert a barrier with the events from each command-queue into the // convergence command list. The resulting event signals the convergence of @@ -6418,7 +6430,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, return Res; // We must keep the event internally to use if new command lists are created. - PI_CALL(piEventRetain(*Event)); + (*Event)->RefCount.increment(); Queue->ActiveBarriers.push_back(*Event); return PI_SUCCESS; } @@ -6496,7 +6508,7 @@ pi_result _pi_queue::synchronize() { (ImmCmdList->first, zeEvent, 0, nullptr)); ZE_CALL(zeHostSynchronize, (zeEvent)); Event->Completed = true; - PI_CALL(piEventRelease(Event)); + PI_CALL(piEventReleaseInternal(Event)); return PI_SUCCESS; }; @@ -6517,7 +6529,7 @@ pi_result _pi_queue::synchronize() { // With the entire queue synchronized, the active barriers must be done so we // can remove them. for (pi_event &BarrierEvent : ActiveBarriers) - PI_CALL(piEventRelease(BarrierEvent)); + PI_CALL(piEventReleaseInternal(BarrierEvent)); ActiveBarriers.clear(); return PI_SUCCESS; @@ -7054,7 +7066,7 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, // Add the event to the command list. if (Event) { CommandList->second.append(*Event); - PI_CALL(piEventRetain(*Event)); + (*Event)->RefCount.increment(); } const auto &ZeCommandList = CommandList->first; @@ -7170,7 +7182,7 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, return Res; CommandList->second.append(*Event); - PI_CALL(piEventRetain(*Event)); + (*Event)->RefCount.increment(); const auto &ZeCommandList = CommandList->first; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 41202e379e861..413e7164c7431 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -1377,9 +1377,23 @@ struct _pi_event : _pi_object { // being visible to the host at all. bool Completed = {false}; - // Indicates that event is internal, i.e. it is visible inside the L0 plugin - // only. - bool Internal = {false}; + // Besides each PI object keeping a total reference count in + // _pi_object::RefCount we keep special track of the event *external* + // references. This way we are able to tell when the event is not referenced + // externally anymore, i.e. it can't be passed as a dependency event to + // piEnqueue* functions and explicitly waited meaning that we can do some + // optimizations: + // 1. For in-order queues we can reset and reuse event if it was not yet + // completed by submitting a reset command to the queue (since there are no + // external references, we know that nobody can wait this event somewhere in + // parallel thread or pass it as a dependency which may lead to hang) + // 2. We can avoid creating host proxy event. + // This counter doesn't track the lifetime of a event object. Even if it + // reaches zero a event object may not be destroyed and can be used internally + // in the plugin. + std::atomic RefCountExternal{0}; + + bool hasExternalRefs() { return RefCountExternal != 0; } // Reset _pi_event object. pi_result reset(); diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index f413f9b72b419..61dfcbd5bcc8c 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -422,8 +422,20 @@ pi_result piQueueCreate(pi_context context, pi_device device, return cast(ret_err); } + // Check that unexpected bits are not set. + assert(!(properties & + ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | + PI_QUEUE_ON_DEVICE_DEFAULT | PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS))); + + // Properties supported by OpenCL backend. + cl_command_queue_properties SupportByOpenCL = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE | + CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT; + cl_queue_properties CreationFlagProperties[] = { - CL_QUEUE_PROPERTIES, cast(properties), 0}; + CL_QUEUE_PROPERTIES, + cast(properties) & SupportByOpenCL, 0}; *queue = cast(clCreateCommandQueueWithProperties( cast(context), cast(device), CreationFlagProperties, &ret_err)); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index f2f4cd292c7b0..44949f5c88ae9 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -311,12 +311,11 @@ class queue_impl { ext::oneapi::cuda::property::queue::use_default_stream>()) { CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM; } - if (getPlugin().getBackend() == backend::ext_oneapi_level_zero && - MPropList + if (MPropList .has_property()) { // Pass this flag to the Level Zero plugin to be able to check it from // queue property. - CreationFlags |= PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS_MODE_ENABLE; + CreationFlags |= PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS; } RT::PiQueue Queue{}; RT::PiContext Context = MContext->getHandleRef(); From 6fce5404b5d67ccc3ec63655ab65375cf10144b6 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 19 Aug 2022 15:25:46 -0700 Subject: [PATCH 6/6] Fix typos --- sycl/plugins/level_zero/pi_level_zero.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index a2526067d227e..1d35743f98b36 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -1359,14 +1359,14 @@ struct _pi_event : _pi_object { // externally anymore, i.e. it can't be passed as a dependency event to // piEnqueue* functions and explicitly waited meaning that we can do some // optimizations: - // 1. For in-order queues we can reset and reuse event if it was not yet + // 1. For in-order queues we can reset and reuse event even if it was not yet // completed by submitting a reset command to the queue (since there are no // external references, we know that nobody can wait this event somewhere in // parallel thread or pass it as a dependency which may lead to hang) // 2. We can avoid creating host proxy event. - // This counter doesn't track the lifetime of a event object. Even if it - // reaches zero a event object may not be destroyed and can be used internally - // in the plugin. + // This counter doesn't track the lifetime of an event object. Even if it + // reaches zero an event object may not be destroyed and can be used + // internally in the plugin. std::atomic RefCountExternal{0}; bool hasExternalRefs() { return RefCountExternal != 0; }