// Patch overview. This is a unified git diff covering eight files; the line
// breaks inside the hunks have been flattened, so read each `+` / `-` marker as
// the start of an added / removed line.
//
// * .github/workflows/build-hw-reusable.yml: the "Test adapter specific" ctest
//   command gains `-E "memcheck"`.
// * scripts/benchmarks/benches/compute.py: SubmitKernelUR takes a new
//   measureCompletion argument, forwards it as --MeasureCompletion=<value>, and
//   appends " with measure completion" to the benchmark name when it is truthy.
//   A third instance (ioq=1, measureCompletion=1) is added to the benchmark list.
// * source/adapters/level_zero/v2/kernel.cpp: ur_kernel_handle_t_::release() now
//   performs the RefCount decrement itself, returns UR_RESULT_SUCCESS early while
//   decrementAndTest() is false, and ends with `delete this`. urKernelRelease
//   simply returns hKernel->release().
// * source/adapters/level_zero/v2/queue_immediate_in_order.{cpp,hpp}: adds the
//   submittedKernels vector and recordSubmittedKernel(), which stores the kernel
//   handle and increments its RefCount. It is called at the end of
//   enqueueKernelLaunch and enqueueCooperativeKernelLaunchExp. queueFinish()
//   calls release() on every recorded kernel and then clears the vector.
// * test/adapters/level_zero/v2: on non-Windows builds, adds deferred_kernel.cpp
//   as an adapter test plus a memcheck variant, and the .match file expected for
//   the memcheck run.
//
// NOTE(review): queueFinish() now wraps releaseDeferred() and release() in
// UR_CALL. If UR_CALL returns early on failure (macro not visible here --
// confirm), the remaining entries are not released and the vectors are not
// cleared, so a later queueFinish() would call release on entries that were
// already released.
// NOTE(review): submittedKernels receives one entry per launch and is drained
// only in queueFinish(); confirm that growth between finishes is acceptable.
// NOTE(review): urMultiQueueLaunchKernelDeferFreeTest::SetUp checks
// devices.size() before calling the base SetUp(); confirm `devices` is already
// populated at that point.
// NOTE(review): text inside angle brackets (the #include target, std::vector
// element types, reinterpret_cast target types, the CMake expression after `$`)
// appears to be missing from this copy -- verify against the original commit.
diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml index 3c791db161..b53bec5705 100644 --- a/.github/workflows/build-hw-reusable.yml +++ b/.github/workflows/build-hw-reusable.yml @@ -112,7 +112,7 @@ jobs: - name: Test adapter specific working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180 + run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" -E "memcheck" --timeout 180 # Don't run adapter specific tests when building multiple adapters if: ${{ matrix.adapter.other_name == '' }} diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index be48acce36..b3f403bfac 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -78,8 +78,9 @@ def benchmarks(self) -> list[Benchmark]: if options.ur is not None: benches += [ - SubmitKernelUR(self, 0), - SubmitKernelUR(self, 1), + SubmitKernelUR(self, 0, 0), + SubmitKernelUR(self, 1, 0), + SubmitKernelUR(self, 1, 1), ] return benches @@ -180,13 +181,14 @@ def bin_args(self) -> list[str]: ] class SubmitKernelUR(ComputeBenchmark): - def __init__(self, bench, ioq): + def __init__(self, bench, ioq, measureCompletion): self.ioq = ioq + self.measureCompletion = measureCompletion super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel") def name(self): order = "in order" if self.ioq else "out of order" - return f"api_overhead_benchmark_ur SubmitKernel {order}" + return f"api_overhead_benchmark_ur SubmitKernel {order}" + (" with measure completion" if self.measureCompletion else "") def explicit_group(self): return "SubmitKernel" @@ -195,7 +197,7 @@ def bin_args(self) -> list[str]: return [ f"--Ioq={self.ioq}", "--DiscardEvents=0", - "--MeasureCompletion=0", + f"--MeasureCompletion={self.measureCompletion}", "--iterations=100000", "--Profiling=0", "--NumKernels=10", diff --git 
a/source/adapters/level_zero/v2/kernel.cpp b/source/adapters/level_zero/v2/kernel.cpp index dcf58d5b62..f770e07590 100644 --- a/source/adapters/level_zero/v2/kernel.cpp +++ b/source/adapters/level_zero/v2/kernel.cpp @@ -95,6 +95,9 @@ ur_kernel_handle_t_::ur_kernel_handle_t_( } ur_result_t ur_kernel_handle_t_::release() { + if (!RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + // manually release kernels to allow errors to be propagated for (auto &singleDeviceKernelOpt : deviceKernels) { if (singleDeviceKernelOpt.has_value()) { @@ -104,6 +107,8 @@ ur_result_t ur_kernel_handle_t_::release() { UR_CALL_THROWS(ur::level_zero::urProgramRelease(hProgram)); + delete this; + return UR_RESULT_SUCCESS; } @@ -362,13 +367,7 @@ ur_result_t urKernelRetain( ur_result_t urKernelRelease( ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release ) try { - if (!hKernel->RefCount.decrementAndTest()) - return UR_RESULT_SUCCESS; - - hKernel->release(); - delete hKernel; - - return UR_RESULT_SUCCESS; + return hKernel->release(); } catch (...) 
{ return exceptionToResult(std::current_exception()); } diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index d97b4e39f9..79ec8f2b41 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -186,13 +186,25 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { // Free deferred events for (auto &hEvent : deferredEvents) { - hEvent->releaseDeferred(); + UR_CALL(hEvent->releaseDeferred()); } deferredEvents.clear(); + // Free deferred kernels + for (auto &hKernel : submittedKernels) { + UR_CALL(hKernel->release()); + } + submittedKernels.clear(); + return UR_RESULT_SUCCESS; } +void ur_queue_immediate_in_order_t::recordSubmittedKernel( + ur_kernel_handle_t hKernel) { + submittedKernels.push_back(hKernel); + hKernel->RefCount.increment(); +} + ur_result_t ur_queue_immediate_in_order_t::queueFlush() { return UR_RESULT_SUCCESS; } @@ -251,6 +263,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, zeSignalEvent, waitList.second, waitList.first)); + recordSubmittedKernel(hKernel); + return UR_RESULT_SUCCESS; } @@ -1063,6 +1077,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, zeSignalEvent, waitList.second, waitList.first)); + recordSubmittedKernel(hKernel); + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 03fdbe0075..e0d7f747b3 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -47,6 +47,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { std::vector waitList; std::vector deferredEvents; + 
std::vector submittedKernels; std::pair getWaitListView(const ur_event_handle_t *phWaitEvents, @@ -82,6 +83,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + void recordSubmittedKernel(ur_kernel_handle_t hKernel); + public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, const ur_queue_properties_t *); diff --git a/test/adapters/level_zero/v2/CMakeLists.txt b/test/adapters/level_zero/v2/CMakeLists.txt index df6b43c443..2bc53669c5 100644 --- a/test/adapters/level_zero/v2/CMakeLists.txt +++ b/test/adapters/level_zero/v2/CMakeLists.txt @@ -61,3 +61,27 @@ add_adapter_test(level_zero_memory_residency "UR_ADAPTERS_FORCE_LOAD=\"$\"" "ZES_ENABLE_SYSMAN=1" ) + +if(NOT WIN32) + add_adapter_test(level_zero_deferred_kernel + FIXTURE KERNELS + SOURCES + deferred_kernel.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + ) + + set(backend level_zero) + add_adapter_memcheck_test(level_zero_deferred_kernel + FIXTURE KERNELS + SOURCES + deferred_kernel.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + ) + + target_link_libraries(test-adapter-level_zero_deferred_kernel PRIVATE + LevelZeroLoader + LevelZeroLoader-Headers + ) +endif() diff --git a/test/adapters/level_zero/v2/deferred_kernel.cpp b/test/adapters/level_zero/v2/deferred_kernel.cpp new file mode 100644 index 0000000000..0f3c628ec1 --- /dev/null +++ b/test/adapters/level_zero/v2/deferred_kernel.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include "../../../conformance/enqueue/helpers.h" +#include "../ze_helpers.hpp" +#include "uur/fixtures.h" +#include "uur/raii.h" + +struct urEnqueueKernelLaunchTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "fill"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + + uint32_t val = 42; + size_t global_size = 32; + size_t global_offset = 0; + size_t n_dimensions = 1; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchTest); + +TEST_P(urEnqueueKernelLaunchTest, DeferredKernelRelease) { + ur_mem_handle_t buffer = nullptr; + AddBuffer1DArg(sizeof(val) * global_size, &buffer); + AddPodArg(val); + + auto zeEvent = createZeEvent(context, device); + + ur_event_handle_t event; + ASSERT_SUCCESS(urEventCreateWithNativeHandle( + reinterpret_cast(zeEvent.get()), context, nullptr, + &event)); + + ASSERT_SUCCESS(urEnqueueEventsWait(queue, 1, &event, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, nullptr, + 0, nullptr, nullptr)); + ASSERT_SUCCESS(urKernelRelease(kernel)); + + // Kernel should still be alive since kernel launch is pending + ur_context_handle_t contextFromKernel; + ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT, + sizeof(ur_context_handle_t), + &contextFromKernel, nullptr)); + + ASSERT_EQ(context, contextFromKernel); + + ze_event_handle_t ze_event = nullptr; + ASSERT_SUCCESS(urEventGetNativeHandle( + event, reinterpret_cast(&ze_event))); + ASSERT_EQ(zeEventHostSignal(ze_event), ZE_RESULT_SUCCESS); + + ASSERT_SUCCESS(urQueueFinish(queue)); + + kernel = nullptr; + + ASSERT_SUCCESS(urEventRelease(event)); +} + +struct urMultiQueueLaunchKernelDeferFreeTest + : uur::urMultiQueueMultiDeviceTest<2> { + std::string KernelName; + + static constexpr char ProgramName[] = "foo"; + static constexpr size_t ArraySize = 100; + static constexpr 
uint32_t InitialValue = 1; + + ur_program_handle_t program = nullptr; + ur_kernel_handle_t kernel = nullptr; + + void SetUp() override { + if (devices.size() < 2) { + GTEST_SKIP() << "This test requires at least 2 devices"; + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiQueueMultiDeviceTest<2>::SetUp()); + + KernelName = uur::KernelsEnvironment::instance->GetEntryPointNames( + ProgramName)[0]; + + std::shared_ptr> il_binary; + std::vector metadatas{}; + + uur::KernelsEnvironment::instance->LoadSource(ProgramName, platform, + il_binary); + + const ur_program_properties_t properties = { + UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr, + static_cast(metadatas.size()), + metadatas.empty() ? nullptr : metadatas.data()}; + + ASSERT_SUCCESS(urProgramCreateWithIL(context, il_binary->data(), + il_binary->size(), &properties, + &program)); + + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urProgramBuild(context, program, nullptr)); + ASSERT_SUCCESS(urKernelCreate(program, KernelName.data(), &kernel)); + } + + void TearDown() override { + // kernel will be released in the actual test + + urProgramRelease(program); + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiQueueMultiDeviceTest<2>::TearDown()); + } +}; + +UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urMultiQueueLaunchKernelDeferFreeTest); + +TEST_P(urMultiQueueLaunchKernelDeferFreeTest, Success) { + auto zeEvent1 = createZeEvent(context, devices[0]); + auto zeEvent2 = createZeEvent(context, devices[1]); + + ur_event_handle_t event1; + ASSERT_SUCCESS(urEventCreateWithNativeHandle( + reinterpret_cast(zeEvent1.get()), context, nullptr, + &event1)); + ur_event_handle_t event2; + ASSERT_SUCCESS(urEventCreateWithNativeHandle( + reinterpret_cast(zeEvent2.get()), context, nullptr, + &event2)); + + size_t global_offset = 0; + size_t global_size = 1; + + ASSERT_SUCCESS(urEnqueueEventsWait(queues[0], 1, &event1, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernel, 1, &global_offset, + &global_size, nullptr, 0, nullptr, + nullptr)); 
+ + ASSERT_SUCCESS(urEnqueueEventsWait(queues[1], 1, &event2, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernel, 1, &global_offset, + &global_size, nullptr, 0, nullptr, + nullptr)); + + ASSERT_SUCCESS(urKernelRelease(kernel)); + + // Kernel should still be alive since both kernels are pending + ur_context_handle_t contextFromKernel; + ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT, + sizeof(ur_context_handle_t), + &contextFromKernel, nullptr)); + ASSERT_EQ(context, contextFromKernel); + + ASSERT_EQ(zeEventHostSignal(zeEvent2.get()), ZE_RESULT_SUCCESS); + ASSERT_SUCCESS(urQueueFinish(queues[1])); + + // Kernel should still be alive since kernel launch is pending + ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT, + sizeof(ur_context_handle_t), + &contextFromKernel, nullptr)); + ASSERT_EQ(context, contextFromKernel); + + ASSERT_EQ(zeEventHostSignal(zeEvent1.get()), ZE_RESULT_SUCCESS); + ASSERT_SUCCESS(urQueueFinish(queues[0])); + + ASSERT_SUCCESS(urEventRelease(event1)); + ASSERT_SUCCESS(urEventRelease(event2)); +} diff --git a/test/adapters/level_zero/v2/level_zero_deferred_kernel_memcheck.match b/test/adapters/level_zero/v2/level_zero_deferred_kernel_memcheck.match new file mode 100644 index 0000000000..74da8b0b7d --- /dev/null +++ b/test/adapters/level_zero/v2/level_zero_deferred_kernel_memcheck.match @@ -0,0 +1,2 @@ +{{IGNORE}} +{{.*}} ERROR SUMMARY: 0 errors from 0 contexts {{.*}}