Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu: nvidia: amd: Get native context through device #1765

Merged
merged 14 commits into from
Jan 26, 2024
39 changes: 39 additions & 0 deletions src/gpu/amd/sycl_hip_compat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "sycl_hip_compat.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace amd {
namespace compat {

/* Specialization for ::sycl::device: returns the device's primary HIP
 * context rather than a raw interop handle.
 *
 * hipDevicePrimaryCtxRetain increments the primary context's reference
 * count, so each call must eventually be balanced by a matching
 * hipDevicePrimaryCtxRelease on the caller's side.
 *
 * Throws std::runtime_error if the primary context cannot be retained. */
template <>
HIPcontext get_native(const ::sycl::device &device) {
    const HIPdevice native_device
            = ::sycl::get_native<::sycl::backend::ext_oneapi_hip>(device);

    HIPcontext primary_context = nullptr;
    const auto status = hipDevicePrimaryCtxRetain(&primary_context, native_device);
    if (status != hipSuccess)
        throw std::runtime_error("Could not create a native context");

    return primary_context;
}
} // namespace compat
} // namespace amd
} // namespace gpu
} // namespace impl
} // namespace dnnl
7 changes: 6 additions & 1 deletion src/gpu/amd/sycl_hip_compat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,18 @@ void host_task(::sycl::handler &cgh, const T &task) {
cgh.host_task(task);
}

// Generic conversion from a SYCL object to its native HIP counterpart via
// the HIP backend interop API. The enable_if guard rejects ::sycl::context
// on purpose: native contexts are obtained through the ::sycl::device
// specialization below instead of from the SYCL context object.
template <typename native_object_t, typename sycl_object_t,
        typename
        = std::enable_if_t<!std::is_same_v<sycl_object_t, ::sycl::context>>>
native_object_t get_native(const sycl_object_t &sycl_object) {
    auto handle
            = ::sycl::get_native<::sycl::backend::ext_oneapi_hip>(sycl_object);
    return reinterpret_cast<native_object_t>(handle);
}

// Specialization (defined in sycl_hip_compat.cpp): retains and returns the
// device's primary HIP context. The caller owns one reference and must
// balance it with hipDevicePrimaryCtxRelease.
template <>
HIPcontext get_native(const ::sycl::device &device);

} // namespace compat
} // namespace amd
} // namespace gpu
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/amd/sycl_hip_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ status_t sycl_hip_engine_t::set_miopen_handle() {
return status::success;
}
// Returns the device's primary HIP context. compat::get_native retains a
// reference on it, so callers are responsible for a matching
// hipDevicePrimaryCtxRelease (the scoped-context handler and stream init
// visible in this change do exactly that).
hipCtx_t sycl_hip_engine_t::get_underlying_context() const {
    // The diff fragment carried both the old context()-based line and the
    // new device()-based line; only the device-based lookup is kept, since
    // the native context is now derived from the device.
    return compat::get_native<hipCtx_t>(device());
}

hipDevice_t sycl_hip_engine_t::get_underlying_device() const {
Expand Down
8 changes: 4 additions & 4 deletions src/gpu/amd/sycl_hip_scoped_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,14 @@ hip_sycl_scoped_context_handler_t::hip_sycl_scoped_context_handler_t(
const sycl_hip_engine_t &engine)
: need_to_recover_(false) {
try {
auto desired = engine.get_underlying_context();
HIP_EXECUTE_FUNC(hipCtxGetCurrent, &original_);
auto desired = engine.get_underlying_context();
currentDevice_ = engine.get_underlying_device();

if (original_ != desired) {

HIP_EXECUTE_FUNC(hipCtxSetCurrent, desired);

need_to_recover_
= !(original_ == nullptr && engine.has_primary_context());
need_to_recover_ = original_ != nullptr;
}
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand All @@ -51,6 +50,7 @@ hip_sycl_scoped_context_handler_t::
~hip_sycl_scoped_context_handler_t() noexcept(false) {

try {
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, currentDevice_);
if (need_to_recover_) { HIP_EXECUTE_FUNC(hipCtxSetCurrent, original_); }
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand Down
1 change: 1 addition & 0 deletions src/gpu/amd/sycl_hip_scoped_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ namespace amd {

class hip_sycl_scoped_context_handler_t {
hipCtx_t original_;
hipDevice_t currentDevice_;
bool need_to_recover_;

public:
Expand Down
7 changes: 6 additions & 1 deletion src/gpu/amd/sycl_hip_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ HIPstream sycl_hip_stream_t::get_underlying_stream() {

// Returns the primary HIP context of the queue's device. Note that
// compat::get_native retains a reference on the primary context, so the
// caller must balance this with hipDevicePrimaryCtxRelease — the
// sycl_hip_stream_t does not keep that reference itself (see init()).
HIPcontext sycl_hip_stream_t::get_underlying_context() {
    return compat::get_native<HIPcontext>(queue_->get_device());
}

// the sycl_hip_stream_t will not own this. it is an observer pointer
Expand Down Expand Up @@ -94,6 +94,11 @@ status_t sycl_hip_stream_t::init() {
|| (engine_context != queue_context))
? status::invalid_arguments
: status::success;

// We don't want to keep a reference to engine_context, which is
// retained in get_underlying_context
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, engine_device);
HIP_EXECUTE_FUNC(hipDevicePrimaryCtxRelease, queue_device);
}

return status;
Expand Down
41 changes: 41 additions & 0 deletions src/gpu/nvidia/sycl_cuda_compat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "sycl_cuda_compat.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {
namespace compat {

/* Specialization for ::sycl::device: returns the device's primary CUDA
 * context rather than a raw interop handle.
 *
 * cuDevicePrimaryCtxRetain increments the primary context's reference
 * count, so each call must eventually be balanced by a matching
 * cuDevicePrimaryCtxRelease on the caller's side.
 *
 * Throws std::runtime_error if the primary context cannot be retained. */
template <>
CUcontext get_native(const ::sycl::device &device) {
    const CUdevice native_device
            = ::sycl::get_native<::sycl::backend::ext_oneapi_cuda>(device);

    CUcontext primary_context = nullptr;
    const auto status
            = cuDevicePrimaryCtxRetain(&primary_context, native_device);
    if (status != CUDA_SUCCESS)
        throw std::runtime_error("Could not create a native context");

    return primary_context;
}

} // namespace compat
} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl
7 changes: 6 additions & 1 deletion src/gpu/nvidia/sycl_cuda_compat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,18 @@ void host_task(::sycl::handler &cgh, const T &task) {
cgh.host_task(task);
}

// Generic conversion from a SYCL object to its native CUDA counterpart via
// the CUDA backend interop API. The enable_if guard rejects ::sycl::context
// on purpose: native contexts are obtained through the ::sycl::device
// specialization below instead of from the SYCL context object.
template <typename native_object_t, typename sycl_object_t,
        typename
        = std::enable_if_t<!std::is_same_v<sycl_object_t, ::sycl::context>>>
native_object_t get_native(const sycl_object_t &sycl_object) {
    auto handle
            = ::sycl::get_native<::sycl::backend::ext_oneapi_cuda>(sycl_object);
    return reinterpret_cast<native_object_t>(handle);
}

// Specialization (defined in sycl_cuda_compat.cpp): retains and returns the
// device's primary CUDA context. The caller owns one reference and must
// balance it with cuDevicePrimaryCtxRelease.
template <>
CUcontext get_native(const ::sycl::device &device);

} // namespace compat
} // namespace nvidia
} // namespace gpu
Expand Down
31 changes: 1 addition & 30 deletions src/gpu/nvidia/sycl_cuda_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ status_t cuda_engine_create(engine_t **engine, engine_kind_t engine_kind,
sycl_cuda_engine_t::sycl_cuda_engine_t(engine_kind_t kind,
const ::sycl::device &dev, const ::sycl::context &ctx, size_t index)
: base_t(kind, dev, ctx, index) {
underlying_context_type();
set_cudnn_handle();
set_cublas_handle();
}
Expand Down Expand Up @@ -121,7 +120,7 @@ status_t sycl_cuda_engine_t::set_cudnn_handle() {
}

// Returns the device's primary CUDA context. compat::get_native retains a
// reference on it, so callers are responsible for a matching
// cuDevicePrimaryCtxRelease (the scoped-context handler and stream init
// visible in this change do exactly that).
CUcontext sycl_cuda_engine_t::get_underlying_context() const {
    // The diff fragment carried both the old context()-based line and the
    // new device()-based line; only the device-based lookup is kept, since
    // the native context is now derived from the device.
    return compat::get_native<CUcontext>(device());
}

CUdevice sycl_cuda_engine_t::get_underlying_device() const {
Expand All @@ -137,34 +136,6 @@ status_t sycl_cuda_engine_t::create_stream(
return sycl_cuda_stream_t::create_stream(stream, this, queue);
}

status_t sycl_cuda_engine_t::underlying_context_type() {
// this is a costly function which take avarage up to 75ms
// on titanrx. So we must run it once and store the variable
// in primary_context_;
CUcontext primary, current;
CUcontext desired = compat::get_native<CUcontext>(context());
CUdevice cuda_device = compat::get_native<CUdevice>(device());
CHECK(CUDA_EXECUTE_FUNC_S(cuCtxGetCurrent, &current));

unsigned int flags;
int is_primary_active;
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxGetState, cuda_device, &flags,
&is_primary_active));

// If primary context is active, current context will be the primary context
// So we can do the comparison without the expensive calls to CtxRetain and CtxRelease
if (current == desired || is_primary_active) {
primary_context_
= (current == desired) ? (is_primary_active == 1) : false;
} else {
CHECK(CUDA_EXECUTE_FUNC_S(
cuDevicePrimaryCtxRetain, &primary, cuda_device));
CHECK(CUDA_EXECUTE_FUNC_S(cuDevicePrimaryCtxRelease, cuda_device));
primary_context_ = (primary == desired);
}
return status::success;
}

cudnnHandle_t *sycl_cuda_engine_t::get_cudnn_handle() {
if (!cudnn_handle_.is_set()) set_cudnn_handle();
return cudnn_handle_.get().get();
Expand Down
9 changes: 0 additions & 9 deletions src/gpu/nvidia/sycl_cuda_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,9 @@ class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
CUdevice get_underlying_device() const;
cudnnHandle_t *get_cudnn_handle();
cublasHandle_t *get_cublas_handle();
const bool has_primary_context() const { return primary_context_; }
device_id_t device_id() const override;

protected:
~sycl_cuda_engine_t() override = default;

private:
// This functions sets the context type. Since cuda requires different
// approach in retaining/releasing primary/non-primary context.
status_t underlying_context_type();
status_t set_cudnn_handle();
status_t set_cublas_handle();
// To avoid performance penalty cudnn/cublas required to have one handle per
Expand All @@ -105,8 +98,6 @@ class sycl_cuda_engine_t : public dnnl::impl::sycl::sycl_engine_base_t {
utils::thread_local_storage_t<
std::unique_ptr<cublasHandle_t, void (*)(cublasHandle_t *)>>
cublas_handle_;

bool primary_context_;
};

} // namespace nvidia
Expand Down
7 changes: 4 additions & 3 deletions src/gpu/nvidia/sycl_cuda_scoped_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t(
const sycl_cuda_engine_t &engine)
: need_to_recover_(false) {
try {
auto desired = engine.get_underlying_context();
CUDA_EXECUTE_FUNC(cuCtxGetCurrent, &original_);
auto desired = engine.get_underlying_context();
currentDevice_ = engine.get_underlying_device();

if (original_ != desired) {
// Sets the desired context as the active one for the thread
Expand All @@ -38,8 +39,7 @@ cuda_sycl_scoped_context_handler_t::cuda_sycl_scoped_context_handler_t(
// the same underlying CUDA primary context are destroyed. This
// emulates the behaviour of the CUDA runtime api, and avoids costly
// context switches. No action is required on this side of the if.
need_to_recover_
= !(original_ == nullptr && engine.has_primary_context());
need_to_recover_ = original_ != nullptr;
}
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand All @@ -51,6 +51,7 @@ cuda_sycl_scoped_context_handler_t::
// we need to release the placed_context_ since we set it from
// ctx.get() retains the underlying context so we need to remove it
try {
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease, currentDevice_);
if (need_to_recover_) { CUDA_EXECUTE_FUNC(cuCtxSetCurrent, original_); }
} catch (const std::runtime_error &e) {
error::wrap_c_api(status::runtime_error, e.what());
Expand Down
1 change: 1 addition & 0 deletions src/gpu/nvidia/sycl_cuda_scoped_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ namespace nvidia {
// required to put the stream context on top of the cuda stack
class cuda_sycl_scoped_context_handler_t {
CUcontext original_;
CUdevice currentDevice_;
bool need_to_recover_;

public:
Expand Down
7 changes: 6 additions & 1 deletion src/gpu/nvidia/sycl_cuda_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ CUstream sycl_cuda_stream_t::get_underlying_stream() {

// Returns the primary CUDA context of the queue's device. Note that
// compat::get_native retains a reference on the primary context, so the
// caller must balance this with cuDevicePrimaryCtxRelease — the
// sycl_cuda_stream_t does not keep that reference itself (see init()).
CUcontext sycl_cuda_stream_t::get_underlying_context() {
    return compat::get_native<CUcontext>(queue_->get_device());
}

// the sycl_cuda_stream_t will not own this. it is an observer pointer
Expand Down Expand Up @@ -93,6 +93,11 @@ status_t sycl_cuda_stream_t::init() {
|| (engine_context != queue_context))
? status::invalid_arguments
: status::success;

// We don't want to keep a reference to engine_context, which is
// retained in get_underlying_context
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease_v2, engine_device);
CUDA_EXECUTE_FUNC(cuDevicePrimaryCtxRelease_v2, queue_device);
}

return status;
Expand Down