Skip to content

Commit

Permalink
gpu: nvidia: amd: get native context through device (#1765)
Browse files Browse the repository at this point in the history
Co-authored-by: Denis Samoilov <[email protected]>
  • Loading branch information
hdelan and densamoilov authored Jan 26, 2024
1 parent 8f0f6f5 commit ba51695
Show file tree
Hide file tree
Showing 13 changed files with 116 additions and 51 deletions.
39 changes: 39 additions & 0 deletions src/gpu/amd/sycl_hip_compat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "sycl_hip_compat.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace amd {
namespace compat {

template <>
HIPcontext get_native(const ::sycl::device &device) {
    // Resolve the SYCL device to its native HIP device handle, then retain
    // that device's primary context. The retain increments the primary
    // context's refcount, so every successful call must eventually be
    // balanced with hipDevicePrimaryCtxRelease on the same device.
    const HIPdevice native_device
            = ::sycl::get_native<::sycl::backend::ext_oneapi_hip>(device);
    HIPcontext native_context;
    const auto status = hipDevicePrimaryCtxRetain(&native_context, native_device);
    if (status != hipSuccess) {
        throw std::runtime_error("Could not create a native context");
    }
    return native_context;
}
} // namespace compat
} // namespace amd
} // namespace gpu
} // namespace impl
} // namespace dnnl
7 changes: 6 additions & 1 deletion src/gpu/amd/sycl_hip_compat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,18 @@ void host_task(::sycl::handler &cgh, const T &task) {
cgh.host_task(task);
}

template <typename native_object_t, typename sycl_object_t>
template <typename native_object_t, typename sycl_object_t,
typename
= std::enable_if_t<!std::is_same_v<sycl_object_t, ::sycl::context>>>
native_object_t get_native(const sycl_object_t &sycl_object) {
auto handle
= ::sycl::get_native<::sycl::backend::ext_oneapi_hip>(sycl_object);
return reinterpret_cast<native_object_t>(handle);
}

template <>
HIPcontext get_native(const ::sycl::device &device);

} // namespace compat
} // namespace amd
} // namespace gpu
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/amd/sycl_hip_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ status_t sycl_hip_engine_t::set_miopen_handle() {
return status::success;
}
hipCtx_t sycl_hip_engine_t::get_underlying_context() const {
return compat::get_native<hipCtx_t>(context());
return compat::get_native<hipCtx_t>(device());
}

hipDevice_t sycl_hip_engine_t::get_underlying_device() const {
Expand Down
8 changes: 4 additions & 4 deletions src/gpu/amd/sycl_hip_scoped_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,14 @@ hip_sycl_scoped_context_handler_t::hip_sycl_scoped_context_handler_t(
const sycl_hip_engine_t &engine)
: need_to_recover_(false) {
try {
auto desired = engine.get_underlying_context();
HIP_EXECUTE_FUNC(hipCtxGetCurrent, &original_);
auto desired = engine.get_underlying_context();
currentDevice_ = engine.get_underlying_device();

if (original_ != desired) {

HIP_EXECUTE_FUNC(hipCtxSetCurrent, desired);

need_to_recover_
= !(original_ == nullptr && engine.has_primary_context());
need_to_recover_ = original_ != nullptr;
}
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand All @@ -51,6 +50,7 @@ hip_sycl_scoped_context_handler_t::
~hip_sycl_scoped_context_handler_t() noexcept(false) {

try {
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, currentDevice_);
if (need_to_recover_) { HIP_EXECUTE_FUNC(hipCtxSetCurrent, original_); }
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand Down
1 change: 1 addition & 0 deletions src/gpu/amd/sycl_hip_scoped_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ namespace amd {

class hip_sycl_scoped_context_handler_t {
hipCtx_t original_;
hipDevice_t currentDevice_;
bool need_to_recover_;

public:
Expand Down
7 changes: 6 additions & 1 deletion src/gpu/amd/sycl_hip_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ HIPstream sycl_hip_stream_t::get_underlying_stream() {

// the sycl_hip_stream_t will not own this. it is an observer pointer
HIPcontext sycl_hip_stream_t::get_underlying_context() {
return compat::get_native<HIPcontext>(queue_->get_context());
return compat::get_native<HIPcontext>(queue_->get_device());
}

// the sycl_hip_stream_t will not own this. it is an observer pointer
Expand Down Expand Up @@ -94,6 +94,11 @@ status_t sycl_hip_stream_t::init() {
|| (engine_context != queue_context))
? status::invalid_arguments
: status::success;

// We don't want to keep a reference to engine_context, which is
// retained in get_underlying_context
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, engine_device);
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, queue_device);
}

return status;
Expand Down
41 changes: 41 additions & 0 deletions src/gpu/nvidia/sycl_cuda_compat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "sycl_cuda_compat.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace compat {

template <>
CUcontext get_native(const ::sycl::device &device) {
    // Resolve the SYCL device to its native CUDA device handle, then retain
    // that device's primary context. The retain increments the primary
    // context's refcount, so every successful call must eventually be
    // balanced with cuDevicePrimaryCtxRelease on the same device.
    const CUdevice native_device
            = ::sycl::get_native<::sycl::backend::ext_oneapi_cuda>(device);
    CUcontext native_context;
    const auto status = cuDevicePrimaryCtxRetain(&native_context, native_device);
    if (status != CUDA_SUCCESS) {
        throw std::runtime_error("Could not create a native context");
    }
    return native_context;
}

} // namespace compat
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
7 changes: 6 additions & 1 deletion src/gpu/nvidia/sycl_cuda_compat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,18 @@ void host_task(::sycl::handler &cgh, const T &task) {
cgh.host_task(task);
}

template <typename native_object_t, typename sycl_object_t>
template <typename native_object_t, typename sycl_object_t,
typename
= std::enable_if_t<!std::is_same_v<sycl_object_t, ::sycl::context>>>
native_object_t get_native(const sycl_object_t &sycl_object) {
auto handle
= ::sycl::get_native<::sycl::backend::ext_oneapi_cuda>(sycl_object);
return reinterpret_cast<native_object_t>(handle);
}

template <>
CUcontext get_native(const ::sycl::device &device);

} // namespace compat
} // namespace nvidia
} // namespace gpu
Expand Down
31 changes: 1 addition & 30 deletions src/gpu/nvidia/sycl_cuda_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ status_t cuda_engine_create(engine_t **engine, engine_kind_t engine_kind,
sycl_cuda_engine_t::sycl_cuda_engine_t(engine_kind_t kind,
const ::sycl::device &dev, const ::sycl::context &ctx, size_t index)
: base_t(kind, dev, ctx, index) {
underlying_context_type();
set_cudnn_handle();
set_cublas_handle();
}
Expand Down Expand Up @@ -121,7 +120,7 @@ status_t sycl_cuda_engine_t::set_cudnn_handle() {
}

CUcontext sycl_cuda_engine_t::get_underlying_context() const {
return compat::get_native<CUcontext>(context());
return compat::get_native<CUcontext>(device());
}

CUdevice sycl_cuda_engine_t::get_underlying_device() const {
Expand All @@ -137,34 +136,6 @@ status_t sycl_cuda_engine_t::create_stream(
return sycl_cuda_stream_t::create_stream(stream, this, queue);
}

status_t sycl_cuda_engine_t::underlying_context_type() {
// this is a costly function which take avarage up to 75ms
// on titanrx. So we must run it once and store the variable
// in primary_context_;
CUcontext primary, current;
CUcontext desired = compat::get_native<CUcontext>(context());
CUdevice cuda_device = compat::get_native<CUdevice>(device());
CHECK(CUDA_EXECUTE_FUNC_S(cuCtxGetCurrent, &current));

unsigned int flags;
int is_primary_active;
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxGetState, cuda_device, &flags,
&is_primary_active));

// If primary context is active, current context will be the primary context
// So we can do the comparison without the expensive calls to CtxRetain and CtxRelease
if (current == desired || is_primary_active) {
primary_context_
= (current == desired) ? (is_primary_active == 1) : false;
} else {
CHECK(CUDA_EXECUTE_FUNC_S(
cuDevicePrimaryCtxRetain, &primary, cuda_device));
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxRelease, cuda_device));
primary_context_ = (primary == desired);
}
return status::success;
}

cudnnHandle_t *sycl_cuda_engine_t::get_cudnn_handle() {
if (!cudnn_handle_.is_set()) set_cudnn_handle();
return cudnn_handle_.get().get();
Expand Down
9 changes: 0 additions & 9 deletions src/gpu/nvidia/sycl_cuda_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,9 @@ class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
CUdevice get_underlying_device() const;
cudnnHandle_t *get_cudnn_handle();
cublasHandle_t *get_cublas_handle();
const bool has_primary_context() const { return primary_context_; }
device_id_t device_id() const override;

protected:
~sycl_cuda_engine_t() override = default;

private:
// This functions sets the context type. Since cuda requires different
// approach in retaining/releasing primary/non-primary context.
status_t underlying_context_type();
status_t set_cudnn_handle();
status_t set_cublas_handle();
// To avoid performance penalty cudnn/cublas required to have one handle per
Expand All @@ -105,8 +98,6 @@ class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
utils::thread_local_storage_t<
std::unique_ptr<cublasHandle_t, void (*)(cublasHandle_t *)>>
cublas_handle_;

bool primary_context_;
};

} // namespace nvidia
Expand Down
7 changes: 4 additions & 3 deletions src/gpu/nvidia/sycl_cuda_scoped_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t(
const sycl_cuda_engine_t &engine)
: need_to_recover_(false) {
try {
auto desired = engine.get_underlying_context();
CUDA_EXECUTE_FUNC(cuCtxGetCurrent, &original_);
auto desired = engine.get_underlying_context();
currentDevice_ = engine.get_underlying_device();

if (original_ != desired) {
// Sets the desired context as the active one for the thread
Expand All @@ -38,8 +39,7 @@ cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t(
// the same underlying CUDA primary context are destroyed. This
// emulates the behaviour of the CUDA runtime api, and avoids costly
// context switches. No action is required on this side of the if.
need_to_recover_
= !(original_ == nullptr && engine.has_primary_context());
need_to_recover_ = original_ != nullptr;
}
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand All @@ -51,6 +51,7 @@ cuda_sycl_scoped_context_handler_t::
// we need to release the placed_context_ since we set it from
// ctx.get() retains the underlying context so we need to remove it
try {
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease, currentDevice_);
if (need_to_recover_) { CUDA_EXECUTE_FUNC(cuCtxSetCurrent, original_); }
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand Down
1 change: 1 addition & 0 deletions src/gpu/nvidia/sycl_cuda_scoped_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ namespace nvidia {
// required to put the stream context on top of the cuda stack
class cuda_sycl_scoped_context_handler_t {
CUcontext original_;
CUdevice currentDevice_;
bool need_to_recover_;

public:
Expand Down
7 changes: 6 additions & 1 deletion src/gpu/nvidia/sycl_cuda_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ CUstream sycl_cuda_stream_t::get_underlying_stream() {

// the sycl_cuda_stream_t will not own this. it is an observer pointer
CUcontext sycl_cuda_stream_t::get_underlying_context() {
return compat::get_native<CUcontext>(queue_->get_context());
return compat::get_native<CUcontext>(queue_->get_device());
}

// the sycl_cuda_stream_t will not own this. it is an observer pointer
Expand Down Expand Up @@ -93,6 +93,11 @@ status_t sycl_cuda_stream_t::init() {
|| (engine_context != queue_context))
? status::invalid_arguments
: status::success;

// We don't want to keep a reference to engine_context, which is
// retained in get_underlying_context
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease_v2, engine_device);
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease_v2, queue_device);
}

return status;
Expand Down

0 comments on commit ba51695

Please sign in to comment.