Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 28 additions & 25 deletions csrc/driver_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,31 +61,34 @@
//
// Driver APIs are loaded using cudaGetDriverEntryPoint as recommended by
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#using-the-runtime-api
#define DEFINE_DRIVER_API_WRAPPER(funcName) \
namespace { \
template <typename ReturnType, typename... Args> \
struct funcName##Loader { \
static ReturnType lazilyLoadAndInvoke(Args... args) { \
static decltype(::funcName)* f; \
static std::once_flag once; \
std::call_once(once, [&]() { \
NVFUSER_CUDA_RT_SAFE_CALL(cudaGetDriverEntryPoint( \
#funcName, reinterpret_cast<void**>(&f), cudaEnableDefault)); \
}); \
return f(args...); \
} \
/* This ctor is just a CTAD helper, it is only used in a */ \
/* non-evaluated environment*/ \
funcName##Loader(ReturnType(Args...)){}; \
}; \
\
/* Use CTAD rule to deduct return and argument types */ \
template <typename ReturnType, typename... Args> \
funcName##Loader(ReturnType(Args...)) \
->funcName##Loader<ReturnType, Args...>; \
} /* namespace */ \
\
decltype(::funcName)* funcName = \
#define DEFINE_DRIVER_API_WRAPPER(funcName, version) \
namespace { \
template <typename ReturnType, typename... Args> \
struct funcName##Loader { \
static ReturnType lazilyLoadAndInvoke(Args... args) { \
static decltype(::funcName)* f; \
static std::once_flag once; \
std::call_once(once, [&]() { \
NVFUSER_CUDA_RT_SAFE_CALL(cudaGetDriverEntryPointByVersion( \
#funcName, \
reinterpret_cast<void**>(&f), \
version, \
cudaEnableDefault)); \
}); \
return f(args...); \
} \
/* This ctor is just a CTAD helper, it is only used in a */ \
/* non-evaluated environment*/ \
funcName##Loader(ReturnType(Args...)){}; \
}; \
\
/* Use CTAD rule to deduct return and argument types */ \
template <typename ReturnType, typename... Args> \
funcName##Loader(ReturnType(Args...)) \
->funcName##Loader<ReturnType, Args...>; \
} /* namespace */ \
\
decltype(::funcName)* funcName = \
decltype(funcName##Loader(::funcName))::lazilyLoadAndInvoke

namespace nvfuser {
Expand Down
65 changes: 43 additions & 22 deletions csrc/driver_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#pragma once

#include <cuda.h>
#include <cudaTypedefs.h>

// How to lazily load a driver API and invoke it? Just forget about lazy loading
// and write code as if you are using the driver API directly. Magic will
Expand All @@ -17,33 +16,55 @@

namespace nvfuser {

#define DECLARE_DRIVER_API_WRAPPER(funcName) \
#define DECLARE_DRIVER_API_WRAPPER(funcName, version) \
extern decltype(::funcName)* funcName

// List of driver APIs that you want the magic to happen.
#define ALL_DRIVER_API_WRAPPER_CUDA11(fn) \
fn(cuDeviceGetAttribute); \
fn(cuDeviceGetName); \
fn(cuFuncGetAttribute); \
fn(cuFuncSetAttribute); \
fn(cuGetErrorName); \
fn(cuGetErrorString); \
fn(cuLaunchCooperativeKernel); \
fn(cuLaunchKernel); \
fn(cuModuleGetFunction); \
fn(cuModuleLoadDataEx); \
fn(cuModuleUnload); \
fn(cuStreamWriteValue32); \
fn(cuStreamWaitValue32); \
fn(cuMemGetAddressRange); \
fn(cuOccupancyMaxActiveBlocksPerMultiprocessor)
//
// The second argument is the CUDA_VERSION **requested** for the driver API.
// It's fine if the **actual** CUDA_VERSION is larger than this. For max
// compatibility, this requested CUDA_VERSION should be as low as possible, as
// long as it supports the capabilities that nvFuser requires.
//
// nvFuser is expected to support only CUDA_VERSION >= 11000, so I didn't try
// to go lower than that.
#define ALL_DRIVER_API_WRAPPER_CUDA(fn) \
fn(cuDeviceGetAttribute, 11000); \
fn(cuDeviceGetName, 11000); \
fn(cuFuncGetAttribute, 11000); \
fn(cuFuncSetAttribute, 11000); \
fn(cuGetErrorName, 11000); \
fn(cuGetErrorString, 11000); \
fn(cuLaunchCooperativeKernel, 11000); \
fn(cuLaunchKernel, 11000); \
fn(cuModuleGetFunction, 11000); \
fn(cuModuleLoadDataEx, 11000); \
fn(cuModuleUnload, 11000); \
fn(cuMemGetAddressRange, 11000); \
fn(cuOccupancyMaxActiveBlocksPerMultiprocessor, 11000)

// Stream memory operations (e.g. cuStreamWriteValue32) are specified for both
// 11 and 12+. In CUDA 11, these operations require NVreg_EnableStreamMemOPs=1
// to be explicitly enabled. CUDA 12+ removed this requirement. Therefore, we
// try to request version 12000 whenever it's available.
//
// Details: CUDA 11.7 introduced _v2 of these APIs, which removed the above
// NVreg_EnableStreamMemOPs=1 requirement. In CUDA 12, these _v2 APIs are
// integrated into the vanilla APIs and are therefore removed. Refer to
// https://docs.nvidia.com/cuda/archive/11.7.1/cuda-driver-api/group__CUDA__MEMOP.html
#if (CUDA_VERSION >= 12000)
#define ALL_DRIVER_API_WRAPPER(fn) \
ALL_DRIVER_API_WRAPPER_CUDA11(fn); \
fn(cuTensorMapEncodeTiled)
#define ALL_DRIVER_API_WRAPPER(fn) \
ALL_DRIVER_API_WRAPPER_CUDA(fn); \
fn(cuStreamWaitValue32, 12000); \
fn(cuStreamWriteValue32, 12000); \
fn(cuTensorMapEncodeTiled, 12000)
#elif (CUDA_VERSION >= 11000)
#define ALL_DRIVER_API_WRAPPER(fn) \
ALL_DRIVER_API_WRAPPER_CUDA(fn); \
fn(cuStreamWaitValue32, 11000); \
fn(cuStreamWriteValue32, 11000)
#else
#define ALL_DRIVER_API_WRAPPER ALL_DRIVER_API_WRAPPER_CUDA11
#error "CUDA_VERSION < 11000 isn't supported."
#endif

ALL_DRIVER_API_WRAPPER(DECLARE_DRIVER_API_WRAPPER);
Expand Down
21 changes: 0 additions & 21 deletions tests/cpp/test_multidevice_ipc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,25 +185,4 @@ TEST_F(IpcTest, IpcMemHandlePtrArithmeticAtSender) {
NVFUSER_CUDA_RT_SAFE_CALL(cudaFree(d_ptr));
}

// cuStreamWriteValue32 and cuStreamWaitValue32 are CUDA driver API used in the
// context of synchronization in p2p communication over cudaIpcHandle
using StreamOpTest = NVFuserTest;
TEST_F(StreamOpTest, StreamWriteValue32) {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_driver_api.cc already has a similar test

cudaStream_t stream;
void* buf;
int value = 0;
constexpr int new_value = 42;
NVFUSER_CUDA_RT_SAFE_CALL(cudaSetDevice(0));
NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamCreate(&stream));
NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(&buf, sizeof(int)));
NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream));
NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT));
NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
&value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream));
NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamSynchronize(stream));
EXPECT_EQ(value, new_value);
}

} // namespace nvfuser