From f2afed2c67fe0422743d9e84216744cb0dd799e4 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 22 Jul 2024 14:21:58 +0100 Subject: [PATCH 01/14] Try L0 impl for enqueue native command Draft impl for discussion. --- source/adapters/level_zero/enqueue_native.cpp | 54 +++++++- .../exp_enqueue_native/CMakeLists.txt | 17 +++ .../enqueue_native_level_zero.cpp | 128 ++++++++++++++++++ 3 files changed, 195 insertions(+), 4 deletions(-) create mode 100644 test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index b67cccc4f1..a5afbd8d23 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -10,11 +10,57 @@ #include +#include "logger/ur_logger.hpp" #include "queue.hpp" +#include "ur_level_zero.hpp" ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( - ur_exp_enqueue_native_command_function_t, void *, uint32_t, - const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + auto Queue = this; + + // TODO: Do I need this lock? + std::scoped_lock Lock(Queue->Mutex); + + // TODO: What do I need to do with phMemList? Will a ur_mem_handle_t always + // be usable as a native arg from within pfnNativeEnqueue, or should some + // mem migration happen? + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, phEventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventWaitList, + true /* AllowBatching */)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = phEvent == nullptr; + ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent; + + UR_CALL(createEventAndAssociateQueue(Queue, Event, + UR_COMMAND_ENQUEUE_NATIVE_EXP, + CommandList, IsInternal, false)); + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, phEventWaitList, + CommandList->second.ZeQueue)); + (*Event)->WaitList = TmpWaitList; + + // FIXME: blocking synchronization. Make this faster + Queue->queueFinish(); + + // Execute interop func + pfnNativeEnqueue(Queue, data); + + // FIXME: blocking synchronization. Make this faster + Queue->queueFinish(); + + return UR_RESULT_SUCCESS; } diff --git a/test/conformance/exp_enqueue_native/CMakeLists.txt b/test/conformance/exp_enqueue_native/CMakeLists.txt index 8638fa1349..8769cf716b 100644 --- a/test/conformance/exp_enqueue_native/CMakeLists.txt +++ b/test/conformance/exp_enqueue_native/CMakeLists.txt @@ -15,4 +15,21 @@ if (UR_BUILD_ADAPTER_CUDA) target_link_libraries(test-exp_enqueue_native PRIVATE cudadrv) endif() +if (UR_BUILD_ADAPTER_L0) + add_conformance_test_with_kernels_environment( + exp_enqueue_native + enqueue_native_level_zero.cpp + ) + target_link_libraries(test-exp_enqueue_native PRIVATE + LevelZeroLoader + LevelZeroLoader-Headers + ) + + target_include_directories(test-exp_enqueue_native PRIVATE + ${PROJECT_SOURCE_DIR}/source + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero + LevelZeroLoader-Headers + ) +endif() + # TODO: Add more tests for different triples diff --git a/test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp b/test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp new file mode 100644 index 0000000000..75dacebddb --- /dev/null +++ b/test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "ze_api.h" + +#include +#include + +using T = uint32_t; + +struct urLevelZeroEnqueueNativeCommandTest : uur::urQueueTest { + void SetUp() { + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp()); + + host_vec = std::vector(global_size, 0); + ASSERT_EQ(host_vec.size(), global_size); + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + static constexpr T val = 42; + static constexpr uint32_t global_size = 1e7; + std::vector host_vec; + void *device_ptr = nullptr; + static constexpr size_t allocation_size = sizeof(val) * global_size; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEnqueueNativeCommandTest); + +struct InteropData1 { + void *fill_ptr; +}; + +// Fill a device ptr with the pattern val +void interop_func_1(ur_queue_handle_t hQueue, void *data) { + ze_command_list_handle_t CommandList; + ASSERT_SUCCESS(urQueueGetNativeHandle(hQueue, nullptr, + (ur_native_handle_t *)&CommandList)); + InteropData1 *func_data = reinterpret_cast(data); + + // If L0 interop becomes a real use case we should make a new UR entry point + // to propagate events into and out of the the interop func. + zeCommandListAppendMemoryFill( + CommandList, func_data->fill_ptr, + &urLevelZeroEnqueueNativeCommandTest::val, + sizeof(urLevelZeroEnqueueNativeCommandTest::val), + urLevelZeroEnqueueNativeCommandTest::allocation_size, nullptr, 0, + nullptr); +} + +struct InteropData2 { + void *from, *to; +}; + +// Read from device ptr to host ptr +void interop_func_2(ur_queue_handle_t hQueue, void *data) { + ze_command_list_handle_t CommandList; + ASSERT_SUCCESS(urQueueGetNativeHandle(hQueue, nullptr, + (ur_native_handle_t *)&CommandList)); + InteropData2 *func_data = reinterpret_cast(data); + + // If L0 interop becomes a real use case we should make a new UR entry point + // to propagate events into and out of the the interop func. + zeCommandListAppendMemoryCopy( + CommandList, func_data->to, func_data->from, + urLevelZeroEnqueueNativeCommandTest::allocation_size, nullptr, 0, + nullptr); +} + +TEST_P(urLevelZeroEnqueueNativeCommandTest, Success) { + InteropData1 data_1{device_ptr}; + ur_event_handle_t event_1; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); +} + +TEST_P(urLevelZeroEnqueueNativeCommandTest, Dependencies) { + ur_event_handle_t event_1, event_2; + + InteropData1 data_1{device_ptr}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); + + InteropData2 data_2{device_ptr, host_vec.data()}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 1, &event_1, &event_2)); + urQueueFinish(queue); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} + +TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURBefore) { + ur_event_handle_t event_1, event_2; + + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(val), &val, + allocation_size, 0, + nullptr /*phEventWaitList=*/, &event_1)); + + InteropData2 data_2{device_ptr, host_vec.data()}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 1, &event_1, &event_2)); + urQueueFinish(queue); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} + +TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURAfter) { + ur_event_handle_t event_1; + + InteropData1 data_1{device_ptr}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); + + urEnqueueUSMMemcpy(queue, /*blocking*/ true, host_vec.data(), device_ptr, + allocation_size, 1, &event_1, nullptr); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} From 7d14d84e758e27d2eea81d3279bc7f584b39b831 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 22 Jul 2024 16:46:49 +0100 Subject: [PATCH 02/14] Update entry point Thanks pbalcer for suggestion. --- source/adapters/level_zero/enqueue_native.cpp | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index a5afbd8d23..132dd94f76 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -18,49 +18,56 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventList, ur_event_handle_t *phEvent) { auto Queue = this; - // TODO: Do I need this lock? - std::scoped_lock Lock(Queue->Mutex); - - // TODO: What do I need to do with phMemList? Will a ur_mem_handle_t always - // be usable as a native arg from within pfnNativeEnqueue, or should some - // mem migration happen? + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); bool UseCopyEngine = false; + + // Please note that the following code should be run before the + // subsequent getAvailableCommandList() call so that there is no + // dead-lock from waiting unsubmitted events in an open batch. + // The createAndRetainUrZeEventList() has the proper side-effect + // of submitting batches with dependent events. + // _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, phEventWaitList, Queue, UseCopyEngine)); + NumEventsInWaitList, phEventList, Queue, UseCopyEngine)); // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; + // TODO: Change UseCopyEngine argument to 'true' once L0 backend + // support is added UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventWaitList, - true /* AllowBatching */)); + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList)); + // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; - ur_event_handle_t InternalEvent{}; + ur_event_handle_t InternalEvent; bool IsInternal = phEvent == nullptr; ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent; - - UR_CALL(createEventAndAssociateQueue(Queue, Event, - UR_COMMAND_ENQUEUE_NATIVE_EXP, + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_PREFETCH, CommandList, IsInternal, false)); - UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, - NumEventsInWaitList, phEventWaitList, - CommandList->second.ZeQueue)); + ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; - // FIXME: blocking synchronization. Make this faster - Queue->queueFinish(); - + const auto &WaitList = (*Event)->WaitList; + const auto &ZeCommandList = CommandList->first; + if (WaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + } // Execute interop func pfnNativeEnqueue(Queue, data); - // FIXME: blocking synchronization. Make this faster - Queue->queueFinish(); + // TODO: Level Zero does not have a completion "event" with the prefetch API, + // so manually add command to signal our event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, false)); return UR_RESULT_SUCCESS; } From 8020612420f9e429ea2fa5058d57f60a36302134 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 23 Jul 2024 11:02:37 +0100 Subject: [PATCH 03/14] Add match files Add empty match files for level_zero. --- .../exp_enqueue_native_adapter_level_zero-v2.match | 0 .../exp_enqueue_native_adapter_level_zero.match | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match create mode 100644 test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero.match diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero.match new file mode 100644 index 0000000000..e69de29bb2 From d76742eb68a5f9c6e03a79c4b0e4570debb3fcb0 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 24 Jul 2024 11:33:19 +0100 Subject: [PATCH 04/14] Use ScopedCommandList to get thread local CL Same as the CUDA implementation. This means that any CommandList obtained through urQueueGetNativeHandle will be the same CommmandList that is synchronized with before the interop func call. --- source/adapters/level_zero/device.cpp | 2 +- source/adapters/level_zero/enqueue_native.cpp | 1 + source/adapters/level_zero/queue.cpp | 9 +++++++ source/adapters/level_zero/queue.hpp | 26 +++++++++++++++++++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 086c92b19a..80e18edb9c 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -877,7 +877,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { // L0 doesn't support enqueueing native work through the urNativeEnqueueExp - return ReturnValue(static_cast(false)); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_ESIMD_SUPPORT: { diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 132dd94f76..652b0c8693 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -43,6 +43,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( // support is added UR_CALL(Queue->Context->getAvailableCommandList( Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList)); + ScopedCommandList Active{Queue, CommandList->first}; // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 7498072d95..1020d04252 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -705,6 +705,15 @@ ur_result_t ur_queue_handle_legacy_t_::queueGetNativeHandle( ) { auto Queue = this; + // Needed for EnqueueNativeCommandExp, so that the native queue 'got' in the + // interop func is the as the native queue used to manage dependencies + // before the interop func invocation + if (Queue->getThreadLocalCommandList() != ze_command_list_handle_t{0}) { + auto ZeCmdList = ur_cast(NativeQueue); + *ZeCmdList = Queue->getThreadLocalCommandList(); + return UR_RESULT_SUCCESS; + } + // Lock automatically releases when this goes out of scope. std::shared_lock lock(Queue->Mutex); diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 6e2444d2fa..8b6f325ef6 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -423,6 +423,12 @@ struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ { uint32_t, const ur_event_handle_t *, ur_event_handle_t *) override; + // Thread local stream will be used if ScopedStream is active + static ze_command_list_handle_t &getThreadLocalCommandList() { + static thread_local ze_command_list_handle_t CommandList{0}; + return CommandList; + } + using queue_type = ur_device_handle_t_::queue_group_info_t::type; // PI queue is in general a one to many mapping to L0 native queues. struct ur_queue_group_t { @@ -941,3 +947,23 @@ ur_result_t setSignalEvent(ur_queue_handle_legacy_t Queue, bool UseCopyEngine, ur_result_t CleanupEventListFromResetCmdList( std::vector &EventListToCleanup, bool QueueLocked = false); + +// RAII object to make hQueue command list getter methods all return the same +// command list within the lifetime of this object. +// +// This is useful for urEnqueueNativeCommandExp where we want guarantees that +// the user submitted native calls will be dispatched to a known command list, +// which must be "got" within the user submitted fuction. +class ScopedCommandList { + ur_queue_handle_legacy_t hQueue; + +public: + ScopedCommandList(ur_queue_handle_legacy_t hQueue, + ze_command_list_handle_t CommandList) + : hQueue{hQueue} { + hQueue->getThreadLocalCommandList() = CommandList; + } + ~ScopedCommandList() { + hQueue->getThreadLocalCommandList() = ze_command_list_handle_t{0}; + } +}; From 7fbc58bdff68c865808c44566b5d79f0c334c98e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 24 Jul 2024 11:35:26 +0100 Subject: [PATCH 05/14] Remove lock --- source/adapters/level_zero/enqueue_native.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 652b0c8693..a420dc0e05 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -22,9 +22,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( ur_event_handle_t *phEvent) { auto Queue = this; - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - bool UseCopyEngine = false; // Please note that the following code should be run before the From 245afb3fb76a885fc8b8c4b6ea64a8ae3eb5d4da Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 24 Jul 2024 12:33:29 +0100 Subject: [PATCH 06/14] Update source/adapters/level_zero/enqueue_native.cpp Co-authored-by: Piotr Balcer --- source/adapters/level_zero/enqueue_native.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index a420dc0e05..9769ccd291 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -47,7 +47,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( ur_event_handle_t InternalEvent; bool IsInternal = phEvent == nullptr; ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent; - UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_PREFETCH, + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_ENQUEUE_NATIVE_EXP, CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; From 382325db576d9a684932b3dedfea9d0e3ef11c93 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 24 Jul 2024 12:43:42 +0100 Subject: [PATCH 07/14] Remove comment --- source/adapters/level_zero/enqueue_native.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 9769ccd291..21530f9fdd 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -61,8 +61,6 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( // Execute interop func pfnNativeEnqueue(Queue, data); - // TODO: Level Zero does not have a completion "event" with the prefetch API, - // so manually add command to signal our event. ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); UR_CALL(Queue->executeCommandList(CommandList, false)); From 6111fb2c7a121e2280c96234dfd8e96c6afbbf0b Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 24 Jul 2024 12:46:37 +0100 Subject: [PATCH 08/14] For out of order queues call queue finish We can't use normal synchronization for out of order queues, so use brute force queueFinish. --- source/adapters/level_zero/enqueue_native.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 21530f9fdd..3f58b0e949 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -40,14 +40,14 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( // support is added UR_CALL(Queue->Context->getAvailableCommandList( Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList)); - ScopedCommandList Active{Queue, CommandList->first}; // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; bool IsInternal = phEvent == nullptr; ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent; - UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_ENQUEUE_NATIVE_EXP, + UR_CALL(createEventAndAssociateQueue(Queue, Event, + UR_COMMAND_ENQUEUE_NATIVE_EXP, CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -58,6 +58,16 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } + + if (!isInOrderQueue()) { + queueFinish(); + // queue finish will execute the command list. Open it again so that + // `zeCommandListAppendSignalEvent` can be executed. + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList)); + } + ScopedCommandList Active{Queue, CommandList->first}; + // Execute interop func pfnNativeEnqueue(Queue, data); @@ -65,5 +75,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( UR_CALL(Queue->executeCommandList(CommandList, false)); + if (!isInOrderQueue()) { + queueFinish(); + } + return UR_RESULT_SUCCESS; } From 632ba6b6dbcf10c5ae9117d365d2fb30557b3df0 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 25 Jul 2024 13:57:48 +0100 Subject: [PATCH 09/14] Update matchfile --- .../exp_enqueue_native_adapter_level_zero-v2.match | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match index e69de29bb2..2c9b3a0f8d 100644 --- a/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match +++ b/test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero-v2.match @@ -0,0 +1,4 @@ +urLevelZeroEnqueueNativeCommandTest.Success{{.*}} +urLevelZeroEnqueueNativeCommandTest.Dependencies{{.*}} +urLevelZeroEnqueueNativeCommandTest.DependenciesURBefore{{.*}} +urLevelZeroEnqueueNativeCommandTest.DependenciesURAfter{{.*}} From 5e1195eaf5ecbbf634966d5f5e00003101303dd2 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 25 Jul 2024 15:05:14 +0100 Subject: [PATCH 10/14] Update source/adapters/level_zero/enqueue_native.cpp Co-authored-by: Piotr Balcer --- source/adapters/level_zero/enqueue_native.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 3f58b0e949..5f1d9dc4d9 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -71,13 +71,14 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( // Execute interop func pfnNativeEnqueue(Queue, data); - ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); - - UR_CALL(Queue->executeCommandList(CommandList, false)); - if (!isInOrderQueue()) { queueFinish(); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList)); } + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, false)); return UR_RESULT_SUCCESS; } From 071223f7046be5da0cec16fd635e03f2fdabcc97 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 29 Jul 2024 12:17:59 +0100 Subject: [PATCH 11/14] Add extra synchronization Enqueue things to L0 before calling queueFinish. --- source/adapters/level_zero/enqueue_native.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 5f1d9dc4d9..88ec56053c 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -60,6 +60,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( } if (!isInOrderQueue()) { + UR_CALL(Queue->executeCommandList(CommandList, false)); queueFinish(); // queue finish will execute the command list. Open it again so that // `zeCommandListAppendSignalEvent` can be executed. From 352015fa7a22afad021361b47ac5a687c0c0f897 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 29 Jul 2024 14:36:11 +0100 Subject: [PATCH 12/14] Update comment Clarify wording in comment. --- source/adapters/level_zero/enqueue_native.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 88ec56053c..66852575c8 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -69,7 +69,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( } ScopedCommandList Active{Queue, CommandList->first}; - // Execute interop func + // Call interop func which enqueues native async work pfnNativeEnqueue(Queue, data); if (!isInOrderQueue()) { From 1528f4c5348328b580f14b8bb91af2764f2c22b9 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Tue, 30 Jul 2024 10:42:58 +0200 Subject: [PATCH 13/14] fix ordering between operations in native enqueue --- source/adapters/level_zero/enqueue_native.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 66852575c8..ccd9c97cdf 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -21,6 +21,7 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventList, ur_event_handle_t *phEvent) { auto Queue = this; + std::scoped_lock lock(Queue->Mutex); bool UseCopyEngine = false; @@ -60,12 +61,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( } if (!isInOrderQueue()) { - UR_CALL(Queue->executeCommandList(CommandList, false)); - queueFinish(); - // queue finish will execute the command list. Open it again so that - // `zeCommandListAppendSignalEvent` can be executed. + UR_CALL(Queue->executeCommandList(CommandList, false, false)); UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList)); + Queue, CommandList, UseCopyEngine, 0, nullptr)); } ScopedCommandList Active{Queue, CommandList->first}; @@ -73,9 +71,9 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( pfnNativeEnqueue(Queue, data); if (!isInOrderQueue()) { - queueFinish(); + UR_CALL(Queue->executeCommandList(CommandList, false, false)); UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList)); + Queue, CommandList, UseCopyEngine, 0, nullptr)); } ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); From 716ee15bc2389ff0bc01c1b014ce2ac62ae82a24 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Tue, 30 Jul 2024 11:00:24 +0200 Subject: [PATCH 14/14] always execute the command list between ops in native enqueue --- source/adapters/level_zero/enqueue_native.cpp | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index ccd9c97cdf..cc646a2cc2 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -54,29 +54,27 @@ ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp( (*Event)->WaitList = TmpWaitList; const auto &WaitList = (*Event)->WaitList; - const auto &ZeCommandList = CommandList->first; if (WaitList.Length) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + (CommandList->first, WaitList.Length, WaitList.ZeEventList)); } - if (!isInOrderQueue()) { - UR_CALL(Queue->executeCommandList(CommandList, false, false)); - UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, 0, nullptr)); - } - ScopedCommandList Active{Queue, CommandList->first}; + UR_CALL(Queue->executeCommandList(CommandList, false, false)); + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, 0, nullptr)); - // Call interop func which enqueues native async work - pfnNativeEnqueue(Queue, data); + { + ScopedCommandList Active{Queue, CommandList->first}; - if (!isInOrderQueue()) { - UR_CALL(Queue->executeCommandList(CommandList, false, false)); - UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, 0, nullptr)); + // Call interop func which enqueues native async work + pfnNativeEnqueue(Queue, data); } - ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + UR_CALL(Queue->executeCommandList(CommandList, false, false)); + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, 0, nullptr)); + + ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, ZeEvent)); UR_CALL(Queue->executeCommandList(CommandList, false)); return UR_RESULT_SUCCESS;