diff --git a/csrc/driver_api.cpp b/csrc/driver_api.cpp index 497b7babbb6..63e0dd87a3e 100644 --- a/csrc/driver_api.cpp +++ b/csrc/driver_api.cpp @@ -61,31 +61,34 @@ // // Driver APIs are loaded using cudaGetDriverEntryPoint as recommended by // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#using-the-runtime-api -#define DEFINE_DRIVER_API_WRAPPER(funcName) \ - namespace { \ - template \ - struct funcName##Loader { \ - static ReturnType lazilyLoadAndInvoke(Args... args) { \ - static decltype(::funcName)* f; \ - static std::once_flag once; \ - std::call_once(once, [&]() { \ - NVFUSER_CUDA_RT_SAFE_CALL(cudaGetDriverEntryPoint( \ - #funcName, reinterpret_cast(&f), cudaEnableDefault)); \ - }); \ - return f(args...); \ - } \ - /* This ctor is just a CTAD helper, it is only used in a */ \ - /* non-evaluated environment*/ \ - funcName##Loader(ReturnType(Args...)){}; \ - }; \ - \ - /* Use CTAD rule to deduct return and argument types */ \ - template \ - funcName##Loader(ReturnType(Args...)) \ - ->funcName##Loader; \ - } /* namespace */ \ - \ - decltype(::funcName)* funcName = \ +#define DEFINE_DRIVER_API_WRAPPER(funcName, version) \ + namespace { \ + template \ + struct funcName##Loader { \ + static ReturnType lazilyLoadAndInvoke(Args... 
args) { \ + static decltype(::funcName)* f; \ + static std::once_flag once; \ + std::call_once(once, [&]() { \ + NVFUSER_CUDA_RT_SAFE_CALL(cudaGetDriverEntryPointByVersion( \ + #funcName, \ + reinterpret_cast(&f), \ + version, \ + cudaEnableDefault)); \ + }); \ + return f(args...); \ + } \ + /* This ctor is just a CTAD helper, it is only used in a */ \ + /* non-evaluated environment*/ \ + funcName##Loader(ReturnType(Args...)){}; \ + }; \ + \ + /* Use CTAD rule to deduce return and argument types */ \ + template \ + funcName##Loader(ReturnType(Args...)) \ + ->funcName##Loader; \ + } /* namespace */ \ + \ + decltype(::funcName)* funcName = \ decltype(funcName##Loader(::funcName))::lazilyLoadAndInvoke namespace nvfuser { diff --git a/csrc/driver_api.h b/csrc/driver_api.h index 892bf8e29a9..6067918193e 100644 --- a/csrc/driver_api.h +++ b/csrc/driver_api.h @@ -8,7 +8,6 @@ #pragma once #include -#include // How to lazily load a driver API and invoke it? Just forget about lazy loading // and write code as if you are using the driver API directly. Magic will @@ -17,33 +16,55 @@ namespace nvfuser { -#define DECLARE_DRIVER_API_WRAPPER(funcName) \ +#define DECLARE_DRIVER_API_WRAPPER(funcName, version) \ extern decltype(::funcName)* funcName // List of driver APIs that you want the magic to happen. -#define ALL_DRIVER_API_WRAPPER_CUDA11(fn) \ - fn(cuDeviceGetAttribute); \ - fn(cuDeviceGetName); \ - fn(cuFuncGetAttribute); \ - fn(cuFuncSetAttribute); \ - fn(cuGetErrorName); \ - fn(cuGetErrorString); \ - fn(cuLaunchCooperativeKernel); \ - fn(cuLaunchKernel); \ - fn(cuModuleGetFunction); \ - fn(cuModuleLoadDataEx); \ - fn(cuModuleUnload); \ - fn(cuStreamWriteValue32); \ - fn(cuStreamWaitValue32); \ - fn(cuMemGetAddressRange); \ - fn(cuOccupancyMaxActiveBlocksPerMultiprocessor) +// +// The second argument is the CUDA_VERSION **requested** for the driver API. +// It's fine if the **actual** CUDA_VERSION is larger than this. 
For max +// compatibility, this requested CUDA_VERSION should be as low as possible, as +// long as it supports the capabilities that nvFuser requires. +// +// nvFuser is expected to support only CUDA_VERSION >= 11000, so I didn't try +// to go lower than that. +#define ALL_DRIVER_API_WRAPPER_CUDA(fn) \ + fn(cuDeviceGetAttribute, 11000); \ + fn(cuDeviceGetName, 11000); \ + fn(cuFuncGetAttribute, 11000); \ + fn(cuFuncSetAttribute, 11000); \ + fn(cuGetErrorName, 11000); \ + fn(cuGetErrorString, 11000); \ + fn(cuLaunchCooperativeKernel, 11000); \ + fn(cuLaunchKernel, 11000); \ + fn(cuModuleGetFunction, 11000); \ + fn(cuModuleLoadDataEx, 11000); \ + fn(cuModuleUnload, 11000); \ + fn(cuMemGetAddressRange, 11000); \ + fn(cuOccupancyMaxActiveBlocksPerMultiprocessor, 11000) +// Stream memory operations (e.g. cuStreamWriteValue32) are specified for both +// 11 and 12+. In CUDA 11, these operations require NVreg_EnableStreamMemOPs=1 +// to be explicitly enabled. CUDA 12+ removed this requirement. Therefore, we +// try to request version 12000 whenever it's available. +// +// Details: CUDA 11.7 introduced _v2 of these APIs, which removed the above +// NVreg_EnableStreamMemOPs=1 requirement. In CUDA 12, these _v2 APIs are +// integrated into the vanilla APIs and are therefore removed. 
Refer to +// https://docs.nvidia.com/cuda/archive/11.7.1/cuda-driver-api/group__CUDA__MEMOP.html #if (CUDA_VERSION >= 12000) -#define ALL_DRIVER_API_WRAPPER(fn) \ - ALL_DRIVER_API_WRAPPER_CUDA11(fn); \ - fn(cuTensorMapEncodeTiled) +#define ALL_DRIVER_API_WRAPPER(fn) \ + ALL_DRIVER_API_WRAPPER_CUDA(fn); \ + fn(cuStreamWaitValue32, 12000); \ + fn(cuStreamWriteValue32, 12000); \ + fn(cuTensorMapEncodeTiled, 12000) +#elif (CUDA_VERSION >= 11000) +#define ALL_DRIVER_API_WRAPPER(fn) \ + ALL_DRIVER_API_WRAPPER_CUDA(fn); \ + fn(cuStreamWaitValue32, 11000); \ + fn(cuStreamWriteValue32, 11000) #else -#define ALL_DRIVER_API_WRAPPER ALL_DRIVER_API_WRAPPER_CUDA11 +#error "CUDA_VERSION < 11000 isn't supported." #endif ALL_DRIVER_API_WRAPPER(DECLARE_DRIVER_API_WRAPPER); diff --git a/tests/cpp/test_multidevice_ipc.cpp b/tests/cpp/test_multidevice_ipc.cpp index ba574c0f676..73922b6868a 100644 --- a/tests/cpp/test_multidevice_ipc.cpp +++ b/tests/cpp/test_multidevice_ipc.cpp @@ -185,25 +185,4 @@ TEST_F(IpcTest, IpcMemHandlePtrArithmeticAtSender) { NVFUSER_CUDA_RT_SAFE_CALL(cudaFree(d_ptr)); } -// cuStreamWriteValue32 and cuStreamWaitValue32 are CUDA driver API used in the -// context of synchronization in p2p communication over cudaIpcHandle -using StreamOpTest = NVFuserTest; -TEST_F(StreamOpTest, StreamWriteValue32) { - cudaStream_t stream; - void* buf; - int value = 0; - constexpr int new_value = 42; - NVFUSER_CUDA_RT_SAFE_CALL(cudaSetDevice(0)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamCreate(&stream)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(&buf, sizeof(int))); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream)); - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( - stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - &value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamSynchronize(stream)); - EXPECT_EQ(value, new_value); -} - 
} // namespace nvfuser