NVIDIA · wujingyue · Apr 30, 2025 · Apr 30, 2025 · Apr 30, 2025 · wujingyue
diff --git a/csrc/driver_api.cpp b/csrc/driver_api.cpp
@@ -61,31 +61,34 @@
 //
 // Driver APIs are loaded using cudaGetDriverEntryPoint as recommended by
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#using-the-runtime-api
-#define DEFINE_DRIVER_API_WRAPPER(funcName)                               \
-  namespace {                                                             \
-  template <typename ReturnType, typename... Args>                        \
-  struct funcName##Loader {                                               \
-    static ReturnType lazilyLoadAndInvoke(Args... args) {                 \
-      static decltype(::funcName)* f;                                     \
-      static std::once_flag once;                                         \
-      std::call_once(once, [&]() {                                        \
-        NVFUSER_CUDA_RT_SAFE_CALL(cudaGetDriverEntryPoint(                \
-            #funcName, reinterpret_cast<void**>(&f), cudaEnableDefault)); \
-      });                                                                 \
-      return f(args...);                                                  \
-    }                                                                     \
-    /* This ctor is just a CTAD helper, it is only used in a */           \
-    /* non-evaluated environment*/                                        \
-    funcName##Loader(ReturnType(Args...)){};                              \
-  };                                                                      \
-                                                                          \
-  /* Use CTAD rule to deduct return and argument types */                 \
-  template <typename ReturnType, typename... Args>                        \
-  funcName##Loader(ReturnType(Args...))                                   \
-      ->funcName##Loader<ReturnType, Args...>;                            \
-  } /* namespace */                                                       \
-                                                                          \
-  decltype(::funcName)* funcName =                                        \
+#define DEFINE_DRIVER_API_WRAPPER(funcName, version)                \
+  namespace {                                                       \
+  template <typename ReturnType, typename... Args>                  \
+  struct funcName##Loader {                                         \
+    static ReturnType lazilyLoadAndInvoke(Args... args) {           \
+      static decltype(::funcName)* f;                               \
+      static std::once_flag once;                                   \
+      std::call_once(once, [&]() {                                  \
+        NVFUSER_CUDA_RT_SAFE_CALL(cudaGetDriverEntryPointByVersion( \
+            #funcName,                                              \
+            reinterpret_cast<void**>(&f),                           \
+            version,                                                \
+            cudaEnableDefault));                                    \
+      });                                                           \
+      return f(args...);                                            \
+    }                                                               \
+    /* This ctor is just a CTAD helper, it is only used in a */     \
+    /* non-evaluated environment*/                                  \
+    funcName##Loader(ReturnType(Args...)){};                        \
+  };                                                                \
+                                                                    \
+  /* Use CTAD rule to deduct return and argument types */           \
+  template <typename ReturnType, typename... Args>                  \
+  funcName##Loader(ReturnType(Args...))                             \
+      ->funcName##Loader<ReturnType, Args...>;                      \
+  } /* namespace */                                                 \
+                                                                    \
+  decltype(::funcName)* funcName =                                  \
       decltype(funcName##Loader(::funcName))::lazilyLoadAndInvoke
 
 namespace nvfuser {

diff --git a/csrc/driver_api.h b/csrc/driver_api.h
@@ -8,7 +8,6 @@
 #pragma once
 
 #include <cuda.h>
-#include <cudaTypedefs.h>
 
 // How to lazily load a driver API and invoke it? Just forget about lazy loading
 // and write code as if you are using the driver API directly. Magic will
@@ -17,33 +16,55 @@
 
 namespace nvfuser {
 
-#define DECLARE_DRIVER_API_WRAPPER(funcName) \
+#define DECLARE_DRIVER_API_WRAPPER(funcName, version) \
   extern decltype(::funcName)* funcName
 
 // List of driver APIs that you want the magic to happen.
-#define ALL_DRIVER_API_WRAPPER_CUDA11(fn) \
-  fn(cuDeviceGetAttribute);               \
-  fn(cuDeviceGetName);                    \
-  fn(cuFuncGetAttribute);                 \
-  fn(cuFuncSetAttribute);                 \
-  fn(cuGetErrorName);                     \
-  fn(cuGetErrorString);                   \
-  fn(cuLaunchCooperativeKernel);          \
-  fn(cuLaunchKernel);                     \
-  fn(cuModuleGetFunction);                \
-  fn(cuModuleLoadDataEx);                 \
-  fn(cuModuleUnload);                     \
-  fn(cuStreamWriteValue32);               \
-  fn(cuStreamWaitValue32);                \
-  fn(cuMemGetAddressRange);               \
-  fn(cuOccupancyMaxActiveBlocksPerMultiprocessor)
+//
+// The second argument is the CUDA_VERSION **requested** for the driver API.
+// It's fine if the **actual** CUDA_VERSION is larger than this. For max
+// compatibility, this requested CUDA_VERSION should be as low as possible, as
+// long as it supports the capabilities that nvFuser requires.
+//
+// nvFuser is expected to support only CUDA_VERSION >= 11000, so I didn't try
+// to go lower than that.
+#define ALL_DRIVER_API_WRAPPER_CUDA(fn) \
+  fn(cuDeviceGetAttribute, 11000);      \
+  fn(cuDeviceGetName, 11000);           \
+  fn(cuFuncGetAttribute, 11000);        \
+  fn(cuFuncSetAttribute, 11000);        \
+  fn(cuGetErrorName, 11000);            \
+  fn(cuGetErrorString, 11000);          \
+  fn(cuLaunchCooperativeKernel, 11000); \
+  fn(cuLaunchKernel, 11000);            \
+  fn(cuModuleGetFunction, 11000);       \
+  fn(cuModuleLoadDataEx, 11000);        \
+  fn(cuModuleUnload, 11000);            \
+  fn(cuMemGetAddressRange, 11000);      \
+  fn(cuOccupancyMaxActiveBlocksPerMultiprocessor, 11000)
 
+// Stream memory operations (e.g. cuStreamWriteValue32) are specified for both
+// 11 and 12+. In CUDA 11, these operations require NVreg_EnableStreamMemOPs=1
+// to be explicitly enabled. CUDA 12+ removed this requirement. Therefore, we
+// try to request version 12000 whenever it's available.
+//
+// Details: CUDA 11.7 introduced _v2 of these APIs, which removed the above
+// NVreg_EnableStreamMemOPs=1 requirement. In CUDA 12, these _v2 APIs are
+// integrated into the vanilla APIs and are therefore removed. Refer to
+// https://docs.nvidia.com/cuda/archive/11.7.1/cuda-driver-api/group__CUDA__MEMOP.html
 #if (CUDA_VERSION >= 12000)
-#define ALL_DRIVER_API_WRAPPER(fn)   \
-  ALL_DRIVER_API_WRAPPER_CUDA11(fn); \
-  fn(cuTensorMapEncodeTiled)
+#define ALL_DRIVER_API_WRAPPER(fn) \
+  ALL_DRIVER_API_WRAPPER_CUDA(fn); \
+  fn(cuStreamWaitValue32, 12000);  \
+  fn(cuStreamWriteValue32, 12000); \
+  fn(cuTensorMapEncodeTiled, 12000)
+#elif (CUDA_VERSION >= 11000)
+#define ALL_DRIVER_API_WRAPPER(fn) \
+  ALL_DRIVER_API_WRAPPER_CUDA(fn); \
+  fn(cuStreamWaitValue32, 11000);  \
+  fn(cuStreamWriteValue32, 11000)
 #else
-#define ALL_DRIVER_API_WRAPPER ALL_DRIVER_API_WRAPPER_CUDA11
+#error "CUDA_VERSION < 11000 isn't supported."
 #endif
 
 ALL_DRIVER_API_WRAPPER(DECLARE_DRIVER_API_WRAPPER);

diff --git a/tests/cpp/test_multidevice_ipc.cpp b/tests/cpp/test_multidevice_ipc.cpp
@@ -185,25 +185,4 @@ TEST_F(IpcTest, IpcMemHandlePtrArithmeticAtSender) {
   NVFUSER_CUDA_RT_SAFE_CALL(cudaFree(d_ptr));
 }
 
-// cuStreamWriteValue32 and cuStreamWaitValue32 are CUDA driver API used in the
-// context of synchronization in p2p communication over cudaIpcHandle
-using StreamOpTest = NVFuserTest;
-TEST_F(StreamOpTest, StreamWriteValue32) {
-  cudaStream_t stream;
-  void* buf;
-  int value = 0;
-  constexpr int new_value = 42;
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaSetDevice(0));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamCreate(&stream));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(&buf, sizeof(int)));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-      buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream));
-  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
-      stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-      &value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamSynchronize(stream));
-  EXPECT_EQ(value, new_value);
-}
-
 } // namespace nvfuser