diff --git a/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_native_command.asciidoc similarity index 72% rename from sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_native_command.asciidoc index 0a84a95caae93..fe844463e0ad3 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_native_command.asciidoc @@ -1,4 +1,4 @@ -= sycl_ext_codeplay_enqueue_native_command += sycl_ext_oneapi_enqueue_native_command :source-highlighter: coderay :coderay-linenums-mode: table @@ -54,10 +54,10 @@ specification.* == Backend support status This extension is currently implemented in {dpcpp} only for GPU devices and -only when using the CUDA or HIP backends. Attempting to use this extension in -kernels that run on other devices or backends may result in undefined -behavior. Be aware that the compiler is not able to issue a diagnostic to -warn you if this happens. +only when using the Level Zero, CUDA, HIP backends. Attempting to use this +extension in kernels that run on other devices or backends may result in +undefined behavior. Be aware that the compiler is not able to issue a +diagnostic to warn you if this happens. == Overview @@ -66,7 +66,7 @@ This extension is derived from the experimental AdaptiveCpp extension, `enqueue_custom_operation` which is documented https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/enqueue-custom-operation.md[here]. -The goal of `ext_codeplay_enqueue_native_command` is to integrate interop +The goal of `ext_oneapi_enqueue_native_command` is to integrate interop work within the SYCL runtime's creation of the asynchronous SYCL DAG. As such, the user defined lambda must only enqueue asynchronous, as opposed to synchronous, backend work within the user lambda. Asynchronous work must only @@ -75,27 +75,30 @@ be submitted to the native queue obtained from === Differences with `host_task` -A callable submitted to `ext_codeplay_enqueue_native_command` won't wait +A callable submitted to `ext_oneapi_enqueue_native_command` won't wait on its dependent events to execute. The dependencies passed to an -`ext_codeplay_enqueue_native_command` submission will result in dependencies being +`ext_oneapi_enqueue_native_command` submission will result in dependencies being implicitly handled in the backend API, using the native queue object associated -with the SYCL queue that the `sycl_ext_codeplay_enqueue_native_command` is +with the SYCL queue that the `sycl_ext_oneapi_enqueue_native_command` is submitted to. This gives different synchronization guarantees from normal SYCL `host_task` s, which guarantee that the `host_task` callable will only begin execution once all of its dependent events have completed. In this example: -``` +```c++ q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_src{buf_src, cgh}; + sycl::accessor acc_dst{buf_dst, cgh}; cgh.depends_on(dep_event); - cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle &h) { - printf("This will print before dep_event has completed.\n"); - // This stream has been synchronized with dep_event's underlying - // hipEvent_t - hipStream_t stream = h.get_native_queue(); - hipMemcpyAsync(target_ptr, native_mem, test_size * sizeof(int), - hipMemcpyDeviceToHost, stream); + cgh.ext_oneapi_enqueue_native_command([=](sycl::interop_handle &h) { + printf("This may print before dep_event has completed.\n"); + auto src = IH.get_native_mem(acc_src); + auto dst = IH.get_native_mem(acc_dst); + auto command_list = h.get_native_queue(); + zeCommandListAppendMemoryCopy( + command_list, dst, src, sizeof(T) * acc_src.get_count(), nullptr, + 0, nullptr); }); }); q.wait() @@ -112,7 +115,7 @@ will only happen once the host task's dependent events are observed to be complete on the host. A SYCL event returned by a submission of a -`ext_codeplay_enqueue_native_command` command is only complete once the +`ext_oneapi_enqueue_native_command` command is only complete once the asynchronous work enqueued to the native queue obtained through `interop_handle::get_native_queue()` has completed. @@ -147,7 +150,7 @@ class: ```c++ class handler { template - void ext_codeplay_enqueue_native_command(Func&& interopCallable); + void ext_oneapi_enqueue_native_command(Func&& interopCallable); }; ``` @@ -156,7 +159,7 @@ parameter of type `interop_handle`. _Effects_: The `interopCallable` object is called exactly once, and this call may be made asynchronously even after the calling thread returns from -`ext_codeplay_enqueue_native_command`. +`ext_oneapi_enqueue_native_command`. The call to `interopCallable` may submit one or more asynchronous tasks to the native backend object obtained from `interop_handle::get_native_queue`, and @@ -181,6 +184,31 @@ any of these things, the behavior is undefined. == Example +```c++ +sycl::queue q; +q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_src{buf_src, cgh}; + sycl::accessor acc_dst{buf_dst, cgh}; + cgh.ext_oneapi_enqueue_native_command([=](sycl::interop_handle &h) { + // Can extract device mem from accessor + auto src = IH.get_native_mem(acc_src); + auto dst = IH.get_native_mem(acc_dst); + auto command_list = h.get_native_queue(); + + // Can enqueue arbitrary backend operations. This could also be a kernel + // launch or call to a library that enqueues operations on the command + // list etc. + // + // Important: Enqueuing a *synchronous* backend operation results in + // undefined behavior. + zeCommandListAppendMemoryCopy( + command_list, dst, src, sizeof(T) * acc_src.get_count(), nullptr, + 0, nullptr); + }); + }); +q.wait(); +``` + This example demonstrates how to use this extension to enqueue asynchronous native tasks on the HIP backend. @@ -189,14 +217,14 @@ sycl::queue q; q.submit([&](sycl::handler &cgh) { sycl::accessor acc{buf, cgh}; - cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle &h) { + cgh.ext_oneapi_enqueue_native_command([=](sycl::interop_handle &h) { // Can extract device pointers from accessors void *native_mem = h.get_native_mem(acc); // Can extract stream hipStream_t stream = h.get_native_queue(); // Can enqueue arbitrary backend operations. This could also be a kernel - // launch or call to a library that enqueues operations on the stream etc + // launch or call to a library that enqueues operations on the stream etc. // // Important: Enqueuing a *synchronous* backend operation results in // undefined behavior. @@ -211,7 +239,7 @@ q.wait(); === sycl_ext_oneapi_graph -`ext_codeplay_enqueue_native_command` +`ext_oneapi_enqueue_native_command` cannot be used in graph nodes. A synchronous exception will be thrown with error code `invalid` if a user tries to add them to a graph. diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index a39c1d1c1884b..3f733440c2b3e 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1943,10 +1943,10 @@ The kernels loaded using link:../experimental/sycl_ext_oneapi_kernel_compiler_spirv.asciidoc[sycl_ext_oneapi_kernel_compiler_spirv] behave as normal when used in graph nodes. -==== sycl_ext_codeplay_enqueue_native_command +==== sycl_ext_oneapi_enqueue_native_command -`ext_codeplay_enqueue_native_command`, defined in -link:../experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc[sycl_ext_codeplay_enqueue_native_command] +`ext_oneapi_enqueue_native_command`, defined in +link:../experimental/sycl_ext_oneapi_enqueue_native_command.asciidoc[sycl_ext_oneapi_enqueue_native_command] cannot be used in graph nodes. A synchronous exception will be thrown with error code `invalid` if a user tries to add them to a graph. diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 9f613fc8ab038..1cd2d32aa1238 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -55,7 +55,7 @@ enum class UnsupportedGraphFeatures { sycl_ext_oneapi_device_global = 6, sycl_ext_oneapi_bindless_images = 7, sycl_ext_oneapi_experimental_cuda_cluster_launch = 8, - sycl_ext_codeplay_enqueue_native_command = 9 + sycl_ext_oneapi_enqueue_native_command = 9 }; inline const char * @@ -80,8 +80,8 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) { return "sycl_ext_oneapi_bindless_images"; case UGF::sycl_ext_oneapi_experimental_cuda_cluster_launch: return "sycl_ext_oneapi_experimental_cuda_cluster_launch"; - case UGF::sycl_ext_codeplay_enqueue_native_command: - return "sycl_ext_codeplay_enqueue_native_command"; + case UGF::sycl_ext_oneapi_enqueue_native_command: + return "sycl_ext_oneapi_enqueue_native_command"; } assert(false && "Unhandled graphs feature"); diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 6f2e9f9fc19b7..117401d90f988 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1867,7 +1867,7 @@ class __SYCL_EXPORT handler { template std::enable_if_t, void(interop_handle)>::value> - ext_codeplay_enqueue_native_command_impl(FuncT &&Func) { + ext_oneapi_enqueue_native_command_impl(FuncT &&Func) { throwIfActionIsCreated(); // Need to copy these rather than move so that we can check associated @@ -2090,11 +2090,11 @@ class __SYCL_EXPORT handler { template std::enable_if_t, void(interop_handle)>::value> - ext_codeplay_enqueue_native_command(FuncT &&Func) { + ext_oneapi_enqueue_native_command(FuncT &&Func) { throwIfGraphAssociated< ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_codeplay_enqueue_native_command>(); - ext_codeplay_enqueue_native_command_impl(Func); + sycl_ext_oneapi_enqueue_native_command>(); + ext_oneapi_enqueue_native_command_impl(Func); } /// Defines and invokes a SYCL kernel function for the specified range and diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index a713783887227..a9bc148358e1e 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3196,7 +3196,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { detail::getSyclObjImpl(MQueue->get_device())->getHandleRef(), UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP, sizeof(NativeCommandSupport), &NativeCommandSupport, nullptr); - assert(NativeCommandSupport && "ext_codeplay_enqueue_native_command is not " + assert(NativeCommandSupport && "ext_oneapi_enqueue_native_command is not " "supported on this device"); MQueue->getPlugin()->call(urEnqueueNativeCommandExp, MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, ReqMems.size(), diff --git a/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp b/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp index ff73a763b2dae..eb86d71d5539c 100644 --- a/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp +++ b/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp @@ -51,7 +51,7 @@ void copy(buffer &Src, buffer &Dst, queue &Q) { if (Q.get_backend() != IH.get_backend()) throw; }; - CGH.ext_codeplay_enqueue_native_command(Func); + CGH.ext_oneapi_enqueue_native_command(Func); }); } @@ -85,7 +85,7 @@ void test_ht_buffer(queue &Q) { Q.submit([&](handler &CGH) { auto Acc = Buffer.get_access(CGH); auto Func = [=](interop_handle IH) { /*A no-op */ }; - CGH.ext_codeplay_enqueue_native_command(Func); + CGH.ext_oneapi_enqueue_native_command(Func); }); } diff --git a/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp b/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp index 8ffc9e7da8127..375219fc67d68 100644 --- a/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp +++ b/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp @@ -57,7 +57,7 @@ void copy(buffer &Src, buffer &Dst, queue &Q) { if (Q.get_backend() != IH.get_backend()) throw; }; - CGH.ext_codeplay_enqueue_native_command(Func); + CGH.ext_oneapi_enqueue_native_command(Func); }); } @@ -91,7 +91,7 @@ void test_ht_buffer(queue &Q) { Q.submit([&](handler &CGH) { auto Acc = Buffer.get_access(CGH); auto Func = [=](interop_handle IH) { /*A no-op */ }; - CGH.ext_codeplay_enqueue_native_command(Func); + CGH.ext_oneapi_enqueue_native_command(Func); }); } diff --git a/sycl/test-e2e/EnqueueNativeCommand/custom-command-level-zero.cpp b/sycl/test-e2e/EnqueueNativeCommand/custom-command-level-zero.cpp new file mode 100644 index 0000000000000..5d2d1710f5cf9 --- /dev/null +++ b/sycl/test-e2e/EnqueueNativeCommand/custom-command-level-zero.cpp @@ -0,0 +1,132 @@ +// RUN: %{build} %level_zero_options -o %t.out +// RUN: %{run} %t.out + +// REQUIRES: level_zero, level_zero_dev_kit + +#include + +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +template class Modifier; + +template class Init; + +template +void checkBufferValues(BufferT Buffer, ValueT Value) { + auto Acc = Buffer.get_host_access(); + for (size_t Idx = 0; Idx < Acc.size(); ++Idx) { + if (Acc[Idx] != Value) { + std::cerr << "buffer[" << Idx << "] = " << Acc[Idx] + << ", expected val = " << Value << '\n'; + exit(1); + } + } +} + +template +void copy(buffer &Src, buffer &Dst, queue &Q) { + Q.submit([&](handler &CGH) { + auto SrcA = Src.template get_access(CGH); + auto DstA = Dst.template get_access(CGH); + + auto Func = [=](interop_handle IH) { + auto CommandList = IH.get_native_queue(); + auto SrcMem = IH.get_native_mem(SrcA); + auto DstMem = IH.get_native_mem(DstA); + + // If L0 interop becomes a real use case we should make a new UR entry + // point to propagate events into and out of the the interop func. + if (zeCommandListAppendMemoryCopy(CommandList, DstMem, SrcMem, + sizeof(DataT) * SrcA.size(), nullptr, 0, + nullptr) != ZE_RESULT_SUCCESS) + throw; + if (Q.get_backend() != IH.get_backend()) + throw; + }; + CGH.ext_oneapi_enqueue_native_command(Func); + }); +} + +template void modify(buffer &B, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc = B.template get_access(CGH); + + auto Kernel = [=](item<1> Id) { Acc[Id] += 1; }; + + CGH.parallel_for>(Acc.size(), Kernel); + }); +} + +template +void init(buffer &B1, buffer &B2, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc1 = B1.template get_access(CGH); + auto Acc2 = B2.template get_access(CGH); + + CGH.parallel_for>(BUFFER_SIZE, [=](item<1> Id) { + Acc1[Id] = B1Init; + Acc2[Id] = B2Init; + }); + }); +} + +// Check that a single host-interop-task with a buffer will work. +void test_ht_buffer(queue &Q) { + buffer Buffer{BUFFER_SIZE}; + + Q.submit([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + auto Func = [=](interop_handle IH) { /*A no-op */ }; + CGH.ext_oneapi_enqueue_native_command(Func); + }); +} + +// A test that uses level_zero interop to copy data from buffer A to buffer B, +// by getting level_zero ptrs and calling the cuMemcpyWithAsync. Then run a SYCL +// kernel that modifies the data in place for B, e.g. increment one, then copy +// back to buffer A. Run it on a loop, to ensure the dependencies and the +// reference counting of the objects is not leaked. +void test_ht_kernel_dependencies(queue &Q) { + static constexpr int COUNT = 4; + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + // Init the buffer with a'priori invalid data. + init(Buffer1, Buffer2, Q); + + // Repeat a couple of times. + for (size_t Idx = 0; Idx < COUNT; ++Idx) { + copy(Buffer1, Buffer2, Q); + modify(Buffer2, Q); + copy(Buffer2, Buffer1, Q); + } + + checkBufferValues(Buffer1, COUNT - 1); + checkBufferValues(Buffer2, COUNT - 1); +} + +void tests(queue &Q) { + test_ht_buffer(Q); + test_ht_kernel_dependencies(Q); +} + +int main() { + queue Q([](sycl::exception_list ExceptionList) { + if (ExceptionList.size() != 1) { + std::cerr << "Should be one exception in exception list" << std::endl; + std::abort(); + } + std::rethrow_exception(*ExceptionList.begin()); + }); + tests(Q); + std::cout << "Test PASSED" << std::endl; + return 0; +} diff --git a/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp b/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp index d4a1cb96101bc..a375b4abf7c35 100644 --- a/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp +++ b/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp @@ -35,7 +35,7 @@ int main() { Q.submit([&](handler &CGH) { accessor Acc{Buf, CGH, read_write}; - CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + CGH.ext_oneapi_enqueue_native_command([=](interop_handle IH) { auto Ptr = IH.get_native_mem(Acc); auto Stream = IH.get_native_queue(); int Tmp = 0; diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 054a663cebdd4..6dc4dbbeeda42 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -406,12 +406,12 @@ TEST_F(CommandGraphTest, BindlessExceptionCheck) { sycl::free(ImgMemUSM, Ctxt); } -// ext_codeplay_enqueue_native_command isn't supported with SYCL graphs +// ext_oneapi_enqueue_native_command isn't supported with SYCL graphs TEST_F(CommandGraphTest, EnqueueCustomCommandCheck) { std::error_code ExceptionCode = make_error_code(sycl::errc::success); try { Graph.add([&](sycl::handler &CGH) { - CGH.ext_codeplay_enqueue_native_command([=](sycl::interop_handle IH) {}); + CGH.ext_oneapi_enqueue_native_command([=](sycl::interop_handle IH) {}); }); } catch (exception &Exception) { ExceptionCode = Exception.code();