Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu: nvidia: amd: Get native context through device #1765

Merged
merged 14 commits into from
Jan 26, 2024
39 changes: 39 additions & 0 deletions src/gpu/amd/sycl_hip_compat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "sycl_hip_compat.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace amd {
namespace compat {

/* Specialization for ::sycl::device: returns the device's primary HIP
 * context rather than a raw interop handle.
 *
 * hipDevicePrimaryCtxRetain increments the primary context's reference
 * count, so each call must eventually be balanced by a matching
 * hipDevicePrimaryCtxRelease on the caller's side.
 *
 * Throws std::runtime_error if the primary context cannot be retained. */
template <>
HIPcontext get_native(const ::sycl::device &device) {
    const HIPdevice native_device
            = ::sycl::get_native<::sycl::backend::ext_oneapi_hip>(device);

    HIPcontext primary_context = nullptr;
    const auto status = hipDevicePrimaryCtxRetain(&primary_context, native_device);
    if (status != hipSuccess)
        throw std::runtime_error("Could not create a native context");

    return primary_context;
}
} // namespace compat
} // namespace amd
} // namespace gpu
} // namespace impl
} // namespace dnnl
7 changes: 6 additions & 1 deletion src/gpu/amd/sycl_hip_compat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,18 @@ void host_task(::sycl::handler &cgh, const T &task) {
cgh.host_task(task);
}

// Generic conversion from a SYCL object to its native HIP counterpart via
// the HIP backend interop API. The enable_if guard rejects ::sycl::context
// on purpose: native contexts are obtained through the ::sycl::device
// specialization below instead of from the SYCL context object.
template <typename native_object_t, typename sycl_object_t,
        typename
        = std::enable_if_t<!std::is_same_v<sycl_object_t, ::sycl::context>>>
native_object_t get_native(const sycl_object_t &sycl_object) {
    auto handle
            = ::sycl::get_native<::sycl::backend::ext_oneapi_hip>(sycl_object);
    return reinterpret_cast<native_object_t>(handle);
}

// Specialization (defined in sycl_hip_compat.cpp): retains and returns the
// device's primary HIP context. The caller owns one reference and must
// balance it with hipDevicePrimaryCtxRelease.
template <>
HIPcontext get_native(const ::sycl::device &device);

} // namespace compat
} // namespace amd
} // namespace gpu
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/amd/sycl_hip_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ status_t sycl_hip_engine_t::set_miopen_handle() {
return status::success;
}
// Returns the device's primary HIP context. compat::get_native retains a
// reference on it, so callers are responsible for a matching
// hipDevicePrimaryCtxRelease (the scoped-context handler and stream init
// visible in this change do exactly that).
hipCtx_t sycl_hip_engine_t::get_underlying_context() const {
    // The diff fragment carried both the old context()-based line and the
    // new device()-based line; only the device-based lookup is kept, since
    // the native context is now derived from the device.
    return compat::get_native<hipCtx_t>(device());
}

hipDevice_t sycl_hip_engine_t::get_underlying_device() const {
Expand Down
8 changes: 4 additions & 4 deletions src/gpu/amd/sycl_hip_scoped_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,14 @@ hip_sycl_scoped_context_handler_t::hip_sycl_scoped_context_handler_t(
const sycl_hip_engine_t &engine)
: need_to_recover_(false) {
try {
auto desired = engine.get_underlying_context();
HIP_EXECUTE_FUNC(hipCtxGetCurrent, &original_);
auto desired = engine.get_underlying_context();
currentDevice_ = engine.get_underlying_device();

if (original_ != desired) {

HIP_EXECUTE_FUNC(hipCtxSetCurrent, desired);

need_to_recover_
= !(original_ == nullptr && engine.has_primary_context());
need_to_recover_ = original_ != nullptr;
}
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand All @@ -51,6 +50,7 @@ hip_sycl_scoped_context_handler_t::
~hip_sycl_scoped_context_handler_t() noexcept(false) {

try {
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, currentDevice_);
if (need_to_recover_) { HIP_EXECUTE_FUNC(hipCtxSetCurrent, original_); }
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand Down
1 change: 1 addition & 0 deletions src/gpu/amd/sycl_hip_scoped_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ namespace amd {

class hip_sycl_scoped_context_handler_t {
hipCtx_t original_;
hipDevice_t currentDevice_;
bool need_to_recover_;

public:
Expand Down
7 changes: 6 additions & 1 deletion src/gpu/amd/sycl_hip_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ HIPstream sycl_hip_stream_t::get_underlying_stream() {

// Returns the primary HIP context of the queue's device. Note that
// compat::get_native retains a reference on the primary context, so the
// caller must balance this with hipDevicePrimaryCtxRelease — the
// sycl_hip_stream_t does not keep that reference itself (see init()).
HIPcontext sycl_hip_stream_t::get_underlying_context() {
    return compat::get_native<HIPcontext>(queue_->get_device());
}

// the sycl_hip_stream_t will not own this. it is an observer pointer
Expand Down Expand Up @@ -94,6 +94,11 @@ status_t sycl_hip_stream_t::init() {
|| (engine_context != queue_context))
? status::invalid_arguments
: status::success;

// We don't want to keep a reference to engine_context, which is
// retained in get_underlying_context
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, engine_device);
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, queue_device);
}

return status;
Expand Down
41 changes: 41 additions & 0 deletions src/gpu/nvidia/sycl_cuda_compat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "sycl_cuda_compat.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace compat {

/* Specialization for ::sycl::device: returns the device's primary CUDA
 * context rather than a raw interop handle.
 *
 * cuDevicePrimaryCtxRetain increments the primary context's reference
 * count, so each call must eventually be balanced by a matching
 * cuDevicePrimaryCtxRelease on the caller's side.
 *
 * Throws std::runtime_error if the primary context cannot be retained. */
template <>
CUcontext get_native(const ::sycl::device &device) {
    const CUdevice native_device
            = ::sycl::get_native<::sycl::backend::ext_oneapi_cuda>(device);

    CUcontext primary_context = nullptr;
    const auto status
            = cuDevicePrimaryCtxRetain(&primary_context, native_device);
    if (status != CUDA_SUCCESS)
        throw std::runtime_error("Could not create a native context");

    return primary_context;
}

} // namespace compat
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
7 changes: 6 additions & 1 deletion src/gpu/nvidia/sycl_cuda_compat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,18 @@ void host_task(::sycl::handler &cgh, const T &task) {
cgh.host_task(task);
}

// Generic conversion from a SYCL object to its native CUDA counterpart via
// the CUDA backend interop API. The enable_if guard rejects ::sycl::context
// on purpose: native contexts are obtained through the ::sycl::device
// specialization below instead of from the SYCL context object.
template <typename native_object_t, typename sycl_object_t,
        typename
        = std::enable_if_t<!std::is_same_v<sycl_object_t, ::sycl::context>>>
native_object_t get_native(const sycl_object_t &sycl_object) {
    auto handle
            = ::sycl::get_native<::sycl::backend::ext_oneapi_cuda>(sycl_object);
    return reinterpret_cast<native_object_t>(handle);
}

// Specialization (defined in sycl_cuda_compat.cpp): retains and returns the
// device's primary CUDA context. The caller owns one reference and must
// balance it with cuDevicePrimaryCtxRelease.
template <>
CUcontext get_native(const ::sycl::device &device);

} // namespace compat
} // namespace nvidia
} // namespace gpu
Expand Down
31 changes: 1 addition & 30 deletions src/gpu/nvidia/sycl_cuda_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ status_t cuda_engine_create(engine_t **engine, engine_kind_t engine_kind,
sycl_cuda_engine_t::sycl_cuda_engine_t(engine_kind_t kind,
const ::sycl::device &dev, const ::sycl::context &ctx, size_t index)
: base_t(kind, dev, ctx, index) {
underlying_context_type();
set_cudnn_handle();
set_cublas_handle();
}
Expand Down Expand Up @@ -121,7 +120,7 @@ status_t sycl_cuda_engine_t::set_cudnn_handle() {
}

// Returns the device's primary CUDA context. compat::get_native retains a
// reference on it, so callers are responsible for a matching
// cuDevicePrimaryCtxRelease (the scoped-context handler and stream init
// visible in this change do exactly that).
CUcontext sycl_cuda_engine_t::get_underlying_context() const {
    // The diff fragment carried both the old context()-based line and the
    // new device()-based line; only the device-based lookup is kept, since
    // the native context is now derived from the device.
    return compat::get_native<CUcontext>(device());
}

CUdevice sycl_cuda_engine_t::get_underlying_device() const {
Expand All @@ -137,34 +136,6 @@ status_t sycl_cuda_engine_t::create_stream(
return sycl_cuda_stream_t::create_stream(stream, this, queue);
}

status_t sycl_cuda_engine_t::underlying_context_type() {
// this is a costly function which take avarage up to 75ms
// on titanrx. So we must run it once and store the variable
// in primary_context_;
CUcontext primary, current;
CUcontext desired = compat::get_native<CUcontext>(context());
CUdevice cuda_device = compat::get_native<CUdevice>(device());
CHECK(CUDA_EXECUTE_FUNC_S(cuCtxGetCurrent, &current));

unsigned int flags;
int is_primary_active;
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxGetState, cuda_device, &flags,
&is_primary_active));

// If primary context is active, current context will be the primary context
// So we can do the comparison without the expensive calls to CtxRetain and CtxRelease
if (current == desired || is_primary_active) {
primary_context_
= (current == desired) ? (is_primary_active == 1) : false;
} else {
CHECK(CUDA_EXECUTE_FUNC_S(
cuDevicePrimaryCtxRetain, &primary, cuda_device));
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxRelease, cuda_device));
primary_context_ = (primary == desired);
}
return status::success;
}

cudnnHandle_t *sycl_cuda_engine_t::get_cudnn_handle() {
if (!cudnn_handle_.is_set()) set_cudnn_handle();
return cudnn_handle_.get().get();
Expand Down
9 changes: 0 additions & 9 deletions src/gpu/nvidia/sycl_cuda_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,9 @@ class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
CUdevice get_underlying_device() const;
cudnnHandle_t *get_cudnn_handle();
cublasHandle_t *get_cublas_handle();
const bool has_primary_context() const { return primary_context_; }
device_id_t device_id() const override;

protected:
~sycl_cuda_engine_t() override = default;

private:
// This functions sets the context type. Since cuda requires different
// approach in retaining/releasing primary/non-primary context.
status_t underlying_context_type();
status_t set_cudnn_handle();
status_t set_cublas_handle();
// To avoid performance penalty cudnn/cublas required to have one handle per
Expand All @@ -105,8 +98,6 @@ class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
utils::thread_local_storage_t<
std::unique_ptr<cublasHandle_t, void (*)(cublasHandle_t *)>>
cublas_handle_;

bool primary_context_;
};

} // namespace nvidia
Expand Down
7 changes: 4 additions & 3 deletions src/gpu/nvidia/sycl_cuda_scoped_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t(
const sycl_cuda_engine_t &engine)
: need_to_recover_(false) {
try {
auto desired = engine.get_underlying_context();
CUDA_EXECUTE_FUNC(cuCtxGetCurrent, &original_);
auto desired = engine.get_underlying_context();
currentDevice_ = engine.get_underlying_device();

if (original_ != desired) {
// Sets the desired context as the active one for the thread
Expand All @@ -38,8 +39,7 @@ cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t(
// the same underlying CUDA primary context are destroyed. This
// emulates the behaviour of the CUDA runtime api, and avoids costly
// context switches. No action is required on this side of the if.
need_to_recover_
= !(original_ == nullptr && engine.has_primary_context());
need_to_recover_ = original_ != nullptr;
}
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand All @@ -51,6 +51,7 @@ cuda_sycl_scoped_context_handler_t::
// we need to release the placed_context_ since we set it from
// ctx.get() retains the underlying context so we need to remove it
try {
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease, currentDevice_);
if (need_to_recover_) { CUDA_EXECUTE_FUNC(cuCtxSetCurrent, original_); }
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand Down
1 change: 1 addition & 0 deletions src/gpu/nvidia/sycl_cuda_scoped_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ namespace nvidia {
// required to put the stream context on top of the cuda stack
class cuda_sycl_scoped_context_handler_t {
CUcontext original_;
CUdevice currentDevice_;
bool need_to_recover_;

public:
Expand Down
7 changes: 6 additions & 1 deletion src/gpu/nvidia/sycl_cuda_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ CUstream sycl_cuda_stream_t::get_underlying_stream() {

// Returns the primary CUDA context of the queue's device. Note that
// compat::get_native retains a reference on the primary context, so the
// caller must balance this with cuDevicePrimaryCtxRelease — the
// sycl_cuda_stream_t does not keep that reference itself (see init()).
CUcontext sycl_cuda_stream_t::get_underlying_context() {
    return compat::get_native<CUcontext>(queue_->get_device());
}

// the sycl_cuda_stream_t will not own this. it is an observer pointer
Expand Down Expand Up @@ -93,6 +93,11 @@ status_t sycl_cuda_stream_t::init() {
|| (engine_context != queue_context))
? status::invalid_arguments
: status::success;

// We don't want to keep a reference to engine_context, which is
// retained in get_underlying_context
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease_v2, engine_device);
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease_v2, queue_device);
}

return status;
Expand Down