diff --git a/sycl/include/sycl/backend.hpp b/sycl/include/sycl/backend.hpp index 0f8ebd75c4d9b..5ddbaa77b658d 100644 --- a/sycl/include/sycl/backend.hpp +++ b/sycl/include/sycl/backend.hpp @@ -214,6 +214,22 @@ get_native(const device &Obj) { Obj.getNative()); } +template <> +inline backend_return_t +get_native(const event &Event) { + if (Event.get_backend() != backend::ext_oneapi_cuda) { + throw sycl::exception(make_error_code(errc::backend_mismatch), + "Backends mismatch"); + } + backend_return_t ReturnValue; + for (auto const &element : Event.getNativeVector()) { + ReturnValue.push_back( + reinterpret_cast::value_type>(element)); + } + return ReturnValue; +} + #ifndef SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL template <> __SYCL_DEPRECATED( @@ -248,6 +264,22 @@ inline backend_return_t get_native< Obj.getNative()); } +template <> +inline backend_return_t +get_native(const event &Event) { + if (Event.get_backend() != backend::ext_oneapi_hip) { + throw sycl::exception(make_error_code(errc::backend_mismatch), + "Backends mismatch"); + } + backend_return_t ReturnValue; + for (auto const &element : Event.getNativeVector()) { + ReturnValue.push_back( + reinterpret_cast::value_type>(element)); + } + return ReturnValue; +} + #endif // SYCL_EXT_ONEAPI_BACKEND_HIP template struct interop { }; template <> struct interop { - using type = CUevent; + using type = std::vector; + using value_type = CUevent; }; template <> struct interop { @@ -90,7 +91,8 @@ template <> struct BackendInput { }; template <> struct BackendReturn { - using type = CUevent; + using type = std::vector; + using value_type = CUevent; }; template <> struct BackendInput { diff --git a/sycl/include/sycl/detail/backend_traits_hip.hpp b/sycl/include/sycl/detail/backend_traits_hip.hpp index eef21a7f413b2..b432472d1caa7 100644 --- a/sycl/include/sycl/detail/backend_traits_hip.hpp +++ b/sycl/include/sycl/detail/backend_traits_hip.hpp @@ -44,7 +44,8 @@ template <> struct interop { }; template <> struct interop { - using type = HIPevent; + using type = std::vector; + using value_type = HIPevent; }; template <> struct interop { @@ -84,7 +85,8 @@ template <> struct BackendInput { }; template <> struct BackendReturn { - using type = HIPevent; + using type = std::vector; + using value_type = HIPevent; }; template <> struct BackendInput { diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp index 9da1f0b664d46..5ceaf71c05d54 100644 --- a/sycl/include/sycl/detail/cg_types.hpp +++ b/sycl/include/sycl/detail/cg_types.hpp @@ -16,6 +16,7 @@ #include // for kernel_param_kind_t #include // for PI_ERROR_INVALID_WORK... #include // for nd_range_error +#include // for property::host_task::manual_interop_sync #include // for group #include // for h_item #include // for id diff --git a/sycl/include/sycl/detail/host_task_impl.hpp b/sycl/include/sycl/detail/host_task_impl.hpp index 1634269ac2f2b..990f03eb4129c 100644 --- a/sycl/include/sycl/detail/host_task_impl.hpp +++ b/sycl/include/sycl/detail/host_task_impl.hpp @@ -24,14 +24,23 @@ namespace detail { class HostTask { std::function MHostTask; std::function MInteropTask; + bool ManualInteropSync = false; public: HostTask() : MHostTask([]() {}) {} HostTask(std::function &&Func) : MHostTask(Func) {} HostTask(std::function &&Func) : MInteropTask(Func) {} + HostTask(std::function &&Func, + const property_list PropList) + : MInteropTask(Func), + ManualInteropSync{ + PropList.has_property()} {} bool isInteropTask() const { return !!MInteropTask; } + bool isManualInteropSync() const { return ManualInteropSync; } + void call(HostProfilingInfo *HPI) { if (HPI) HPI->start(); @@ -74,7 +83,7 @@ std::enable_if_t< detail::check_fn_signature, void()>::value || detail::check_fn_signature, void(interop_handle)>::value> -handler::host_task_impl(FuncT &&Func) { +handler::host_task_impl(FuncT &&Func, const property_list &PropList) { throwIfActionIsCreated(); MNDRDesc.set(range<1>(1)); @@ -82,7 +91,11 @@ handler::host_task_impl(FuncT &&Func) { // accessors during finalize MArgs = MAssociatedAccesors; - MHostTask.reset(new detail::HostTask(std::move(Func))); + if constexpr (detail::check_fn_signature, + void(interop_handle)>::value) + MHostTask.reset(new detail::HostTask(std::move(Func), std::move(PropList))); + else + MHostTask.reset(new detail::HostTask(std::move(Func))); setType(detail::CG::CodeplayHostTask); } diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 544e776a21e1e..8eb77b1964b7c 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -49,8 +49,9 @@ enum DataLessPropKind { GraphDependOnAllLeaves = 24, GraphUpdatable = 25, GraphEnableProfiling = 26, + HostTaskManualInteropSync = 27, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 26, + LastKnownDataLessPropKind = 27, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; diff --git a/sycl/include/sycl/ext/codeplay/experimental/host_task_properties.hpp b/sycl/include/sycl/ext/codeplay/experimental/host_task_properties.hpp new file mode 100644 index 0000000000000..cbc52ded6124c --- /dev/null +++ b/sycl/include/sycl/ext/codeplay/experimental/host_task_properties.hpp @@ -0,0 +1,38 @@ +//==-------- host_task_properties.hpp --- SYCL host task properties --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::codeplay::experimental::property::host_task { + +class manual_interop_sync : public ::sycl::detail::DataLessProperty< + ::sycl::detail::HostTaskManualInteropSync> {}; + +} // namespace ext::codeplay::experimental::property::host_task + +// Forward declaration +class host_task; + +template <> +struct is_property< + ext::codeplay::experimental::property::host_task::manual_interop_sync> + : std::true_type {}; + +template <> +struct is_property_of< + ext::codeplay::experimental::property::host_task::manual_interop_sync, + host_task> : std::true_type {}; + +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp index 0dedc6648fd44..610ef5a7b84c4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp @@ -48,7 +48,8 @@ template <> struct interop { }; template <> struct interop { - using type = CUevent; + using type = std::vector; + using value_type = CUevent; }; template <> struct interop { @@ -92,7 +93,8 @@ template <> struct BackendInput { }; template <> struct BackendReturn { - using type = CUevent; + using type = std::vector; + using value_type = CUevent; }; template <> struct BackendInput { diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp index 9d01c37691f33..3460bb73621aa 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp @@ -28,7 +28,8 @@ inline __SYCL_EXPORT device make_device(pi_native_handle NativeHandle) { // Implementation of cuda::has_native_event inline __SYCL_EXPORT bool has_native_event(event sycl_event) { if (sycl_event.get_backend() == backend::ext_oneapi_cuda) - return get_native(sycl_event) != nullptr; + return get_native(sycl_event).size() && + get_native(sycl_event).front() != nullptr; return false; } diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index a71f5400a813d..41e2fca836787 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1866,7 +1866,7 @@ class __SYCL_EXPORT handler { void()>::value || detail::check_fn_signature, void(interop_handle)>::value> - host_task_impl(FuncT &&Func); + host_task_impl(FuncT &&Func, const property_list &); /// @brief Get the command graph if any associated with this handler. It can /// come from either the associated queue or from being set explicitly through @@ -2067,8 +2067,8 @@ class __SYCL_EXPORT handler { void()>::value || detail::check_fn_signature, void(interop_handle)>::value> - host_task(FuncT &&Func) { - host_task_impl(Func); + host_task(FuncT &&Func, const property_list PropList = {}) { + host_task_impl(Func, PropList); } /// Defines and invokes a SYCL kernel function for the specified range and diff --git a/sycl/include/sycl/interop_handle.hpp b/sycl/include/sycl/interop_handle.hpp index 9839ffd58a6cb..d87b9cb5e0567 100644 --- a/sycl/include/sycl/interop_handle.hpp +++ b/sycl/include/sycl/interop_handle.hpp @@ -39,6 +39,11 @@ class DispatchHostTask; class queue_impl; class device_impl; class context_impl; + +// Needed for get_native_events to check if backend_return_t is +// a vector +template struct is_std_vector : std::false_type {}; +template struct is_std_vector> : std::true_type {}; } // namespace detail class queue; @@ -182,6 +187,66 @@ class interop_handle { #endif } + // Adds events from the native API so that the SYCL runtime can integrate the + // async calls in a native API, with other async operations in the SYCL DAG. + // Using this function removes the requirement that a host task callable must + // synchronize with any asynchronous operations from within the callable. + template + void add_native_events(backend_return_t NativeEvents) { +#ifndef __SYCL_DEVICE_ONLY__ + // TODO: replace the exception thrown below with the SYCL 2020 exception + // with the error code 'errc::backend_mismatch' when those new exceptions + // are ready to be used. + if (Backend != get_backend()) + throw invalid_object_error("Incorrect backend argument was passed", + PI_ERROR_INVALID_MEM_OBJECT); + // All native events can be cast to void*, we use this as a generic entry + // point to source library + std::vector NativeEventHolders(NativeEvents.size()); + for (auto i = 0; i < NativeEvents.size(); ++i) + NativeEventHolders[i] = + reinterpret_cast(NativeEvents[i]); + return addNativeEvents(NativeEventHolders); +#else + // we believe this won't be ever called on device side + return; +#endif + } + + // Gets all the native events that the host task depends on, and that are + // still active + template + backend_return_t get_native_events() { +#ifndef __SYCL_DEVICE_ONLY__ + // TODO: replace the exception thrown below with the SYCL 2020 exception + // with the error code 'errc::backend_mismatch' when those new exceptions + // are ready to be used. + if (Backend != get_backend()) + throw invalid_object_error("Incorrect backend argument was passed", + PI_ERROR_INVALID_MEM_OBJECT); + // All native events can be cast to void*, we use this as a generic entry + // point to source library + std::vector NativeEventHolders = getNativeEvents(); + backend_return_t + RetNativeEvents; // This may be a vector of native events or a single + // native event, depending on the backend + if constexpr (detail::is_std_vector< + backend_return_t>::value) { + using ValueT = typename backend_return_t::value_type; + for (auto i = 0; i < NativeEventHolders.size(); ++i) + RetNativeEvents.push_back( + reinterpret_cast(NativeEventHolders[i])); + } else { + RetNativeEvents = reinterpret_cast>( + NativeEventHolders[0]); + } + return RetNativeEvents; +#else + // we believe this won't be ever called on device side + return {}; +#endif + } + private: friend class detail::ExecCGCommand; friend class detail::DispatchHostTask; @@ -190,8 +255,9 @@ class interop_handle { interop_handle(std::vector MemObjs, const std::shared_ptr &Queue, const std::shared_ptr &Device, - const std::shared_ptr &Context) - : MQueue(Queue), MDevice(Device), MContext(Context), + const std::shared_ptr &Context, + const std::shared_ptr &Event) + : MQueue(Queue), MDevice(Device), MContext(Context), MEvent(Event), MMemObjs(std::move(MemObjs)) {} template @@ -215,10 +281,13 @@ class interop_handle { getNativeQueue(int32_t &NativeHandleDesc) const; __SYCL_EXPORT pi_native_handle getNativeDevice() const; __SYCL_EXPORT pi_native_handle getNativeContext() const; + __SYCL_EXPORT void addNativeEvents(std::vector &); + __SYCL_EXPORT std::vector getNativeEvents() const; std::shared_ptr MQueue; std::shared_ptr MDevice; std::shared_ptr MContext; + std::shared_ptr MEvent; std::vector MMemObjs; }; diff --git a/sycl/include/sycl/properties/all_properties.hpp b/sycl/include/sycl/properties/all_properties.hpp index 6f42500ea6b38..b6f3355b6d65a 100644 --- a/sycl/include/sycl/properties/all_properties.hpp +++ b/sycl/include/sycl/properties/all_properties.hpp @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include #include diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index b1ef3f71ae170..f11fd2ecc8bd9 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -99,14 +99,8 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 1e9b1b493fe30e6236bf611ae6d82366c9376f6c - # Merge: a011f092 d8500a36 - # Author: Kenneth Benzie (Benie) - # Date: Fri Jun 21 10:22:52 2024 +0100 - # Merge pull request #805 from aarongreig/aaron/kernelSetArgIndirectionFix - # Correct level of indirection used in KernelSetArgPointer calls. - set(UNIFIED_RUNTIME_TAG 1e9b1b493fe30e6236bf611ae6d82366c9376f6c) + set(UNIFIED_RUNTIME_REPO "https://github.com/hdelan/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG interop-event-check) fetch_adapter_source(level_zero ${UNIFIED_RUNTIME_REPO} diff --git a/sycl/source/detail/backend_impl.hpp b/sycl/source/detail/backend_impl.hpp index ca23ceb48815c..37add4e8b43e3 100644 --- a/sycl/source/detail/backend_impl.hpp +++ b/sycl/source/detail/backend_impl.hpp @@ -9,13 +9,22 @@ #pragma once #include #include +#include namespace sycl { inline namespace _V1 { namespace detail { template backend getImplBackend(const T &Impl) { - assert(!Impl->is_host() && "Cannot get the backend for host."); + // Experimental host task allows the user to get backend for event impls + if constexpr (std::is_same_v>) { + assert((!Impl->is_host() || Impl->backendSet()) && + "interop_handle::add_native_events must be " + "used in order for a host " + "task event to have a native event"); + } else { + assert(!Impl->is_host() && "Cannot get the backend for host."); + } return Impl->getContextImplPtr()->getBackend(); } diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index c7d245e5e91c0..710111a1daf34 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -82,6 +82,9 @@ void event_impl::waitInternal(bool *Success) { "waitInternal method cannot be used for a discarded event."); } else if (MState != HES_Complete) { // Wait for the host event + // In the case that the Host Task function stores native events with + // add_native_events, waitInternal will only wait on the lambda to complete, + // not on the asynchronous events std::unique_lock lock(MMutex); cv.wait(lock, [this] { return MState == HES_Complete; }); } @@ -91,6 +94,14 @@ void event_impl::waitInternal(bool *Success) { Event->wait(Event); } +void event_impl::waitForHostTaskNativeEvents() { + std::unique_lock Lock(MHostTaskNativeEventsMutex); + if (MHostTaskNativeEventsHaveBeenWaitedOn.exchange(true)) + return; + for (const EventImplPtr &Event : MHostTaskNativeEvents) + Event->wait(Event); +} + void event_impl::setComplete() { if (MHostEvent || !MEvent) { { @@ -265,6 +276,9 @@ void event_impl::wait(std::shared_ptr Self, else if (MCommand) detail::Scheduler::getInstance().waitForEvent(Self, Success); + if (MHostTaskNativeEvents.size()) + waitForHostTaskNativeEvents(); + #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); #endif @@ -483,11 +497,34 @@ pi_native_handle event_impl::getNative() { } if (MContext->getBackend() == backend::opencl) Plugin->call(getHandleRef()); - pi_native_handle Handle; - Plugin->call(getHandleRef(), &Handle); + pi_native_handle Handle = 0; + if (auto HandleRef = getHandleRef()) + Plugin->call(HandleRef, &Handle); return Handle; } +std::vector event_impl::getNativeVector() { + // Return empty vec if native events have already been waited on + if (isCompleted() && (!hasHostTaskNativeEvents() || + MHostTaskNativeEventsHaveBeenWaitedOn.load())) + return {}; + + // If there is a native event return that. This will also initialize context + if (auto nativeEvent = getNative()) + return {nativeEvent}; + + // Return native events submitted via host task interop + auto Plugin = getPlugin(); + std::vector HandleVec; + for (auto &HostTaskNativeEventImpl : MHostTaskNativeEvents) { + pi_native_handle Handle; + Plugin->call( + HostTaskNativeEventImpl->MEvent, &Handle); + HandleVec.push_back(Handle); + } + return HandleVec; +} + std::vector event_impl::getWaitList() { if (MState == HES_Discarded) throw sycl::exception( @@ -502,6 +539,8 @@ std::vector event_impl::getWaitList() { MPreparedDepsEvents.end()); Result.insert(Result.end(), MPreparedHostDepsEvents.begin(), MPreparedHostDepsEvents.end()); + Result.insert(Result.end(), MHostTaskNativeEvents.begin(), + MHostTaskNativeEvents.end()); return Result; } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 91bef738450d3..af87b074268f9 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -8,6 +8,7 @@ #pragma once +#include // For context_impl::setPlugin #include #include #include @@ -131,6 +132,9 @@ class event_impl { /// it's pointing to is then set according to the outcome. void waitInternal(bool *Success = nullptr); + /// Waits for the event with respect to device type. + void waitForHostTaskNativeEvents(); + /// Marks this event as completed. void setComplete(); @@ -189,6 +193,11 @@ class event_impl { /// \return a native handle. pi_native_handle getNative(); + /// Gets the vector of native handles associated with the SYCL event. + /// + /// \return a native handle. + std::vector getNativeVector(); + /// Returns vector of event dependencies. /// /// @return a reference to MPreparedDepsEvents. @@ -321,6 +330,28 @@ class event_impl { return MEventFromSubmittedExecCommandBuffer; } + void addHostTaskNativeEvent(EventImplPtr Event) { + // We need to keep track of which native events refer to + // this one host task event, we do that so we can correctly + // return the vec of native events when calling get_native + MHostTaskNativeEvents.push_back(Event); + // We also add to the PreparedDepsEvents so that we can do standard + // dependency analysis + MPreparedDepsEvents.push_back(Event); + } + + bool hasHostTaskNativeEvents() const { + return MHostTaskNativeEvents.size() > 0; + } + + bool backendSet() const { + return !MContext->is_host() || hasHostTaskNativeEvents(); + } + + const std::vector &getHostTaskNativeEvents() const { + return MHostTaskNativeEvents; + } + void setProfilingEnabled(bool Value) { MIsProfilingEnabled = Value; } // Sets a command-buffer command when this event represents an enqueue to a @@ -379,6 +410,12 @@ class event_impl { std::vector MPostCompleteEvents; + // Used to hold pi_events for native events that are stored with + // interop_handle::add_native_events + std::vector MHostTaskNativeEvents; + std::atomic MHostTaskNativeEventsHaveBeenWaitedOn = false; + std::mutex MHostTaskNativeEventsMutex; + /// Indicates that the task associated with this event has been submitted by /// the queue to the device. std::atomic MIsFlushed = false; diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index 34537c7191af6..d609c8b7ac5c3 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -141,7 +141,8 @@ class platform_impl { // \return the Plugin associated with this platform. const PluginPtr &getPlugin() const { - assert(!MHostPlatform && "Plugin is not available for Host."); + assert((!MHostPlatform || MBackendSet) && + "Plugin is not available for Host."); return MPlugin; } @@ -241,6 +242,7 @@ class platform_impl { bool MHostPlatform = false; sycl::detail::pi::PiPlatform MPlatform = 0; backend MBackend; + bool MBackendSet = !MHostPlatform; PluginPtr MPlugin; std::vector> MDeviceCache; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index a164c455fed54..c87f98ef35e1b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -320,8 +320,12 @@ static void flushCrossQueueDeps(const std::vector &EventImpls, class DispatchHostTask { ExecCGCommand *MThisCmd; std::vector MReqToMem; + EventImplPtr MEvent; // If we want to add native events within the host task + // using the interop_handle then we need to have a ptr + // to the event impl of the sycl event returned at + // CGSubmit - pi_result waitForEvents() const { + pi_result waitForNativeDepEvents() const { std::map> RequiredEventsPerPlugin; @@ -341,8 +345,9 @@ class DispatchHostTask { if (RawEvents.size() == 0) continue; try { - PluginWithEvents.first->call(RawEvents.size(), - RawEvents.data()); + if (RawEvents.size()) + PluginWithEvents.first->call( + RawEvents.size(), RawEvents.data()); } catch (const sycl::exception &E) { CGHostTask &HostTask = static_cast(MThisCmd->getCG()); HostTask.MQueue->reportAsyncException(std::current_exception()); @@ -358,6 +363,8 @@ class DispatchHostTask { // Host events can't throw exceptions so don't try to catch it. for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) { Event->waitInternal(); + if (Event->hasHostTaskNativeEvents()) + Event->waitForHostTaskNativeEvents(); } return PI_SUCCESS; @@ -365,8 +372,9 @@ class DispatchHostTask { public: DispatchHostTask(ExecCGCommand *ThisCmd, - std::vector ReqToMem) - : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {} + std::vector ReqToMem, + EventImplPtr MEvent) + : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)), MEvent(MEvent) {} void operator()() const { assert(MThisCmd->getCG().getType() == CG::CGTYPE::CodeplayHostTask); @@ -385,16 +393,18 @@ class DispatchHostTask { } #endif - pi_result WaitResult = waitForEvents(); - if (WaitResult != PI_SUCCESS) { - std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error( - std::string("Couldn't wait for host-task's dependencies"), - WaitResult)); - HostTask.MQueue->reportAsyncException(EPtr); - // reset host-task's lambda and quit - HostTask.MHostTask.reset(); - Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); - return; + if (!HostTask.MHostTask->isManualInteropSync()) { + pi_result WaitResult = waitForNativeDepEvents(); + if (WaitResult != PI_SUCCESS) { + std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error( + std::string("Couldn't wait for host-task's dependencies"), + WaitResult)); + HostTask.MQueue->reportAsyncException(EPtr); + // reset host-task's lambda and quit + HostTask.MHostTask.reset(); + Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); + return; + } } try { @@ -402,7 +412,7 @@ class DispatchHostTask { if (HostTask.MHostTask->isInteropTask()) { interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), - HostTask.MQueue->getContextImplPtr()}; + HostTask.MQueue->getContextImplPtr(), MEvent}; HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH); } else @@ -1055,7 +1065,7 @@ void AllocaCommand::emitInstrumentationData() { pi_int32 AllocaCommand::enqueueImp() { waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); @@ -1152,7 +1162,7 @@ void *AllocaSubBufCommand::getMemAllocation() const { pi_int32 AllocaSubBufCommand::enqueueImp() { waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MMemAllocation = MemoryManager::allocateMemSubBuffer( @@ -1221,7 +1231,7 @@ void ReleaseCommand::emitInstrumentationData() { pi_int32 ReleaseCommand::enqueueImp() { waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); std::vector RawEvents = getPiEvents(EventImpls); bool SkipRelease = false; @@ -1344,7 +1354,7 @@ void MapMemObject::emitInstrumentationData() { pi_int32 MapMemObject::enqueueImp() { waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); std::vector RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); @@ -1430,7 +1440,7 @@ bool UnMapMemObject::producesPiEvent() const { pi_int32 UnMapMemObject::enqueueImp() { waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); std::vector RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); @@ -1540,7 +1550,7 @@ bool MemCpyCommand::producesPiEvent() const { pi_int32 MemCpyCommand::enqueueImp() { waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); @@ -1603,7 +1613,7 @@ void ExecCGCommand::clearAuxiliaryResources() { pi_int32 UpdateHostRequirementCommand::enqueueImp() { waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); Command::waitForEvents(MQueue, EventImpls, Event); @@ -1697,7 +1707,7 @@ const ContextImplPtr &MemCpyCommandHost::getWorkerContext() const { pi_int32 MemCpyCommandHost::enqueueImp() { const QueueImplPtr &Queue = getWorkerQueue(); waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); std::vector RawEvents = getPiEvents(EventImpls); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); @@ -1729,7 +1739,8 @@ EmptyCommand::EmptyCommand(QueueImplPtr Queue) pi_int32 EmptyCommand::enqueueImp() { waitForPreparedHostEvents(); - waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef()); + auto DepEvents = getAllPreparedDepsEvents(); + waitForEvents(MQueue, DepEvents, MEvent->getHandleRef()); return PI_SUCCESS; } @@ -2751,7 +2762,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { // Any device dependencies need to be waited on here since subsequent // submissions of the command buffer itself will not receive dependencies on // them, e.g. initial copies from host to device - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); flushCrossQueueDeps(EventImpls, getWorkerQueue()); std::vector RawEvents = getPiEvents(EventImpls); if (!RawEvents.empty()) { @@ -2906,7 +2917,7 @@ pi_int32 ExecCGCommand::enqueueImp() { pi_int32 ExecCGCommand::enqueueImpQueue() { if (getCG().getType() != CG::CGTYPE::CodeplayHostTask) waitForPreparedHostEvents(); - std::vector EventImpls = MPreparedDepsEvents; + std::vector EventImpls = getAllPreparedDepsEvents(); auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); @@ -3163,7 +3174,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { copySubmissionCodeLocation(); MQueue->getThreadPool().submit( - DispatchHostTask(this, std::move(ReqToMem))); + DispatchHostTask(this, std::move(ReqToMem), MEvent)); MShouldCompleteEventIfPossible = false; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8ba0cceee9e6a..eb77ec55f3e22 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -301,6 +301,18 @@ class Command { return MPreparedDepsEvents; } + // This will return the MPreparedDepsEvents as well as any + // HostTaskNativeEvents that an EventImplPtr might hold + const std::vector getAllPreparedDepsEvents() const { + std::vector RetEvents = MPreparedDepsEvents; + for (auto &DepEvent : MPreparedHostDepsEvents) + if (DepEvent->hasHostTaskNativeEvents()) + RetEvents.insert(RetEvents.end(), + DepEvent->getHostTaskNativeEvents().begin(), + DepEvent->getHostTaskNativeEvents().end()); + return RetEvents; + } + // XPTI instrumentation. Copy code location details to the internal struct. // Memory is allocated in this method and released in destructor. void copySubmissionCodeLocation(); diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 9853b25d9d310..7dead84f63bb9 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -57,7 +57,12 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd, std::lock_guard Guard(Cmd->MBlockedUsersMutex); if (Cmd->isBlocking()) { const EventImplPtr &RootCmdEvent = RootCommand->getEvent(); - Cmd->addBlockedUserUnique(RootCmdEvent); + // Host tasks don't need to be added to wait list. When host tasks are + // enqueued, a new thread is created which waits on dep events via + // condition variables, so they don't need to be enqueued by other + // additional means + if (!RootCommand->isHostTask()) + Cmd->addBlockedUserUnique(RootCmdEvent); EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueBlocked, Cmd); // Blocked command will be enqueued asynchronously from submission so we diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index a7bae8055c10b..367cd7c52ac5e 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -127,8 +127,7 @@ backend event::get_backend() const noexcept { return getImplBackend(impl); } pi_native_handle event::getNative() const { return impl->getNative(); } std::vector event::getNativeVector() const { - std::vector ReturnVector = {impl->getNative()}; - return ReturnVector; + return impl->getNativeVector(); } } // namespace _V1 diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index cd479493bbae3..dfe36d78faeac 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -52,5 +52,49 @@ interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const { return MQueue->getNative(NativeHandleDesc); } +void interop_handle::addNativeEvents( + std::vector &NativeEvents) { + auto Plugin = MQueue->getPlugin(); + + if (!MEvent->backendSet()) { + MEvent->setContextImpl(MContext); + } + + // Make a std::vector of PiEvents from the native events + for (auto i = 0u; i < NativeEvents.size(); ++i) { + detail::pi::PiEvent Ev; + Plugin->call( + NativeEvents[i], MContext->getHandleRef(), + /*OwnNativeHandle*/ true, &Ev); + auto EventImpl = std::make_shared( + Ev, detail::createSyclObjFromImpl(MContext)); + // TODO: Do I need to call things like: + // setStateIncomplete -> Not sure + // setSubmissionTime -> Not sure + // More...? + MEvent->addHostTaskNativeEvent(EventImpl); + } +} + +std::vector interop_handle::getNativeEvents() const { + // What if the events here have not yet been enqueued? I will need to wait on + // them. That is probably already done? + // + // Moreover what are the usual requirements of the host task launch? + // + // Do all dependent events need to be complete, or just enqueued? I suspect it + // is the former, and we want the latter in the case that we are using these + // entry points. We will maybe need a new host task entry point. + std::vector RetEvents; + for (auto &DepEvent : MEvent->getWaitList()) { + if (DepEvent->backendSet()) { + auto NativeEvents = DepEvent->getNativeVector(); + RetEvents.insert(RetEvents.end(), NativeEvents.begin(), + NativeEvents.end()); + } + } + return RetEvents; +} + } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/HostInteropTask/native-events/add-native-events-cuda.cpp b/sycl/test-e2e/HostInteropTask/native-events/add-native-events-cuda.cpp new file mode 100644 index 0000000000000..48a19fca6d5be --- /dev/null +++ b/sycl/test-e2e/HostInteropTask/native-events/add-native-events-cuda.cpp @@ -0,0 +1,139 @@ +// REQUIRES: cuda +// +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out + +// These tests use the add_native_events API to ensure that the SYCL RT can +// handle the events submitted to add_native_events within its runtime DAG. +// +// If manual_interop_sync is used then the user deals with async dependencies +// manually in the HT lambda through the get_native_events interface. + +#include "native-events-cuda.hpp" +#include "native-events.hpp" + +#include +#include + +using T = unsigned; // We don't need to test lots of types, we just want a race + // condition +constexpr size_t bufSize = 1e6; +constexpr T pattern = 42; + +// Tries to check for a race condition if the backend events are not added to +// the SYCL dag. +template void test1() { + printf("Running test 2\n"); + sycl::queue q; + std::vector out(bufSize, 0); + + T *ptrHost = sycl::malloc_host(bufSize, q); // malloc_host is necessary to + // make the memcpy as async as + // possible + + auto syclEvent = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + CUdeviceptr cuPtr; + CUDA_CHECK(cuMemAlloc_v2(&cuPtr, bufSize * sizeof(T))); + CUDA_CHECK(cuMemsetD32Async(cuPtr, pattern, bufSize, stream)); + CUDA_CHECK( + cuMemcpyDtoHAsync(ptrHost, cuPtr, bufSize * sizeof(T), stream)); + + CUDA_CHECK(cuEventRecord(ev, stream)); + + ih.add_native_events({ev}); + }); + }); + waitHelper(syclEvent, q); + checkResults(ptrHost, bufSize, pattern); +} + +// Using host task event as a cgh.depends_on with USM +template void test2() { + printf("Running test 3\n"); + using T = unsigned; + + sycl::queue q; + std::vector out(bufSize, 0); + + T *ptrHostA = sycl::malloc_host(bufSize, q); + T *ptrHostB = sycl::malloc_host(bufSize, q); + + auto hostTaskEvent = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + CUdeviceptr cuPtr; + CUDA_CHECK(cuMemAlloc_v2(&cuPtr, bufSize * sizeof(T))); + + CUDA_CHECK(cuMemsetD32Async(cuPtr, pattern, bufSize, stream)); + CUDA_CHECK( + cuMemcpyDtoHAsync(ptrHostA, cuPtr, bufSize * sizeof(T), stream)); + + CUDA_CHECK(cuEventRecord(ev, stream)); + + ih.add_native_events({ev}); + }); + }); + + auto syclEvent = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(hostTaskEvent); + cgh.memcpy(ptrHostB, ptrHostA, bufSize * sizeof(T)); + }); + + waitHelper(syclEvent, q); + checkResults(ptrHostB, bufSize, pattern); + printf("Tests passed\n"); +} + +// Using host task event with implicit DAG from buffer accessor model +template void test3() { + printf("Running test 4\n"); + using T = unsigned; + + sycl::queue q; + + T *ptrHostIn = sycl::malloc_host(bufSize, q); + T *ptrHostOut = sycl::malloc_host(bufSize, q); + + // Dummy buffer to create dependencies between commands. Use a host malloc + // for host ptr to make sure the buffer has pinned memory + sycl::buffer buf{ + ptrHostIn, bufSize, {sycl::property::buffer::use_host_ptr{}}}; + + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc{buf, sycl::write_only}; + + cgh.host_task([&](sycl::interop_handle ih) { + // FIXME: this call fails + auto accPtr = ih.get_native_mem(acc); + auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + + CUDA_CHECK(cuMemsetD32Async(reinterpret_cast(accPtr), + pattern, bufSize, stream)); + CUDA_CHECK(cuEventRecord(ev, stream)); + + ih.add_native_events({ev}); + }); + }); + + { + sycl::host_accessor hostAcc{buf}; + for (auto i = 0; i < bufSize; ++i) + ptrHostOut[i] = hostAcc[i]; + } + + q.wait(); + checkResults(ptrHostOut, bufSize, pattern); + printf("Tests passed\n"); +} + +int main() { + test1(); + test1(); + test2(); + test2(); + // test3(); Fails with `SyclObject.impl && "every constructor + // should create an impl"' failed. + // test3(); +} diff --git a/sycl/test-e2e/HostInteropTask/native-events/get-native-events-cuda.cpp b/sycl/test-e2e/HostInteropTask/native-events/get-native-events-cuda.cpp new file mode 100644 index 0000000000000..798ffb69009fa --- /dev/null +++ b/sycl/test-e2e/HostInteropTask/native-events/get-native-events-cuda.cpp @@ -0,0 +1,187 @@ +// REQUIRES: cuda +// +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out + +// These tests use the get_native_events API together with manual_interop_sync +// property. If manual interop sync is not used then get_native_events is not +// necessary, since all events have been synchronized with already on host, +// before the HT lambda is launched. +// +// If manual_interop_sync is used then the user deals with async dependencies +// manually in the HT lambda through the get_native_events interface. +// + +#include "native-events-cuda.hpp" +#include "native-events.hpp" + +#include +#include + +using T = unsigned; // We don't need to test lots of types, we just want a race + // condition +constexpr size_t bufSize = 1e7; +constexpr T pattern = 42; + +sycl::queue q; + +// Check that the SYCL event that we submit with add_native_events can be +// retrieved later through get_native_events in a dependent host task +template struct test1 { + void operator()() { + printf("Running test 1\n"); + std::atomic atomicEvent; // To share the event from the host task + // with the main thread + + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [_, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + cuEventRecord(ev, 0); + atomicEvent.store(ev); + ih.add_native_events({ev}); + }); + }); + + // This task must wait on the other lambda to complete + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + auto nativeEvents = + ih.get_native_events(); + if constexpr (!UseManualInteropSync) { + // Events should be synchronized with by SYCL RT if + // manual_interop_sync not used + return; + } + assert(std::find(nativeEvents.begin(), nativeEvents.end(), + atomicEvent.load()) != nativeEvents.end()); + }, + PropList(UseManualInteropSync)); + }); + + waitHelper(syclEvent2, q); + } +}; + +// Tries to check for a race condition if the backend events are not added to +// the SYCL dag. +template struct test2 { + void operator()() { + printf("Running test 2\n"); + T *ptrHost = sycl::malloc_host( + bufSize, + q); // malloc_host is necessary to make the memcpy as async as possible + + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.host_task([&](sycl::interop_handle ih) { + auto [stream, ev] = cudaSetCtxAndGetStreamAndEvent(ih); + CUdeviceptr cuPtr; + CUDA_CHECK(cuMemAlloc_v2(&cuPtr, bufSize * sizeof(T))); + CUDA_CHECK(cuMemsetD32Async(cuPtr, pattern, bufSize, stream)); + CUDA_CHECK( + cuMemcpyDtoHAsync(ptrHost, cuPtr, bufSize * sizeof(T), stream)); + + CUDA_CHECK(cuEventRecord(ev, stream)); + + ih.add_native_events({ev}); + }); + }); + + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + cudaSetCtxAndGetStreamAndEvent(ih); + auto nativeEvents = + ih.get_native_events(); + if constexpr (!UseManualInteropSync) { + // Events should be synchronized with by SYCL RT if + // manual_interop_sync not used + return; + } + assert(nativeEvents.size()); + for (auto &cudaEv : nativeEvents) { + CUDA_CHECK(cuEventSynchronize(cudaEv)); + } + }, + PropList(UseManualInteropSync)); + }); + + waitHelper(syclEvent2, q); + for (auto i = 0; i < bufSize; ++i) { + if (ptrHost[i] != pattern) { + fprintf(stderr, "Wrong result at index: %d, have %d vs %d\n", i, + ptrHost[i], pattern); + throw; + } + } + } +}; + +// Using host task event as a cgh.depends_on with USM +template struct test3 { + void operator()() { + printf("Running test 3\n"); + using T = unsigned; + + T *ptrHostA = sycl::malloc_host(bufSize, q); + T *ptrHostB = sycl::malloc_host(bufSize, q); + + T *ptrDevice = sycl::malloc_device(bufSize, q); + + for (auto i = 0; i < bufSize; ++i) + ptrHostA[i] = pattern; + + auto syclEvent1 = q.submit([&](sycl::handler &cgh) { + cgh.memcpy(ptrDevice, ptrHostA, bufSize * sizeof(T)); + }); + + auto syclEvent2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syclEvent1); + cgh.host_task( + [&](sycl::interop_handle ih) { + auto [stream, _] = cudaSetCtxAndGetStreamAndEvent(ih); + auto nativeEvents = + ih.get_native_events(); + if constexpr (UseManualInteropSync) { + assert(nativeEvents.size()); + for (auto &cudaEv : nativeEvents) { + CUDA_CHECK(cuStreamWaitEvent(stream, cudaEv, 0)); + } + } + + CUDA_CHECK(cuMemcpyDtoHAsync( + ptrHostB, reinterpret_cast(ptrDevice), + bufSize * sizeof(T), stream)); + CUDA_CHECK(cuStreamSynchronize(stream)); + }, + PropList(UseManualInteropSync)); + }); + + waitHelper(syclEvent2, q); + + for (auto i = 0; i < bufSize; --i) { + if (ptrHostB[i] != pattern) { + cuCtxSynchronize(); + fprintf(stderr, "Wrong result at index: %d, have %d vs %d\n", i, + ptrHostB[i], pattern); + throw; + } + } + } +}; + +template