dlopen fix for win32

* based on Windows support PR #2456 by @andreigh
* DISPATCH_ARGS fix by @andreigh
* WIN32 fix using LoadLibrary
triton-lang · Dec 4, 2023 · a349d21 · a349d21
1 parent d5c4b30
commit a349d21
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 0 deletions.
diff --git a/python/triton/compiler/make_launcher.py b/python/triton/compiler/make_launcher.py
@@ -109,7 +109,12 @@ def format_of(ty):
 #include \"cuda.h\"
 #include <stdbool.h>
 #include <Python.h>
+#ifndef _WIN32
 #include <dlfcn.h>
+#else
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
 
 static inline void gpuAssert(CUresult code, const char *file, int line)
 {{
@@ -132,6 +137,7 @@ def format_of(ty):
 
 typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);
 
+#ifndef _WIN32
 static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
   // Open the shared library
   void* handle = dlopen("libcuda.so", RTLD_LAZY);
@@ -150,6 +156,25 @@ def format_of(ty):
   }}
   return cuLaunchKernelExHandle;
 }}
+#else
+// Windows: resolve cuLaunchKernelEx from nvcuda.dll at run time. On failure
+// a Python RuntimeError is set and NULL is returned.
+static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
+  HMODULE handle = LoadLibraryA("nvcuda.dll");
+  if (!handle) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll");
+    return NULL;
+  }}
+  cuLaunchKernelEx_t cuLaunchKernelExHandle =
+      (cuLaunchKernelEx_t)GetProcAddress(handle, "cuLaunchKernelEx");
+  // NULL means failure; GetLastError() may hold a stale code on success.
+  if (!cuLaunchKernelExHandle) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from nvcuda.dll");
+    return NULL;
+  }}
+  return cuLaunchKernelExHandle;
+}}
+#endif
 
 static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
   void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }};

diff --git a/python/triton/runtime/backends/cuda.c b/python/triton/runtime/backends/cuda.c
@@ -1,5 +1,10 @@
 #include "cuda.h"
+#ifndef _WIN32
 #include <dlfcn.h>
+#else
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
 #include <stdbool.h>
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
@@ -94,10 +99,17 @@ static bool gpuAssert(CUresult code, const char *file, int line) {
 #define DISPATCH_ARGS_N(_14, _13, _12, _11, _10, _9, _8, _7, _6, _5, _4, _3,   \
                         _2, _1, N, ...)                                        \
   ADD_ENUM_ITEM_##N
+#if !defined(_MSC_VER) || defined(__clang__) // non-MSVC compilers, or any clang
 #define DISPATCH_ARGS(...)                                                     \
   DISPATCH_ARGS_N(__VA_ARGS__, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,  \
                   0)                                                           \
   (__VA_ARGS__)
+#else // _MSC_VER is defined and the compiler is not clang
+#define EXPAND_ARGS(args) args // yields its argument; NOTE(review): presumably an MSVC __VA_ARGS__ workaround - confirm
+#define DISPATCH_ARGS(...)                                                     \
+  DISPATCH_ARGS_N EXPAND_ARGS((__VA_ARGS__, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, \
+                               4, 3, 2, 1, 0))(__VA_ARGS__)
+#endif // !defined(_MSC_VER) || defined(__clang__)
 
 #define ADD_ENUM_TO_MODULE(module, enum_name, ...)                             \
   do {                                                                         \
@@ -377,6 +389,7 @@ typedef CUresult (*cuTensorMapEncodeTiled_t)(
     CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
     CUtensorMapFloatOOBfill oobFill);
 
+#ifndef _WIN32
 static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() {
   // Open the shared library
   void *handle = dlopen("libcuda.so", RTLD_LAZY);
@@ -398,6 +411,28 @@ static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() {
   }
   return cuTensorMapEncodeTiledHandle;
 }
+#else
+// Windows: resolve cuTensorMapEncodeTiled from nvcuda.dll at run time. On
+// failure a Python RuntimeError is set and NULL is returned.
+static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() {
+  HMODULE handle = LoadLibraryA("nvcuda.dll");
+  if (!handle) {
+    PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll");
+    return NULL;
+  }
+  cuTensorMapEncodeTiled_t cuTensorMapEncodeTiledHandle =
+      (cuTensorMapEncodeTiled_t)GetProcAddress(handle,
+                                               "cuTensorMapEncodeTiled");
+  // NULL means failure; GetLastError() may hold a stale code on success.
+  if (!cuTensorMapEncodeTiledHandle) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        "Failed to retrieve cuTensorMapEncodeTiled from nvcuda.dll");
+    return NULL;
+  }
+  return cuTensorMapEncodeTiledHandle;
+}
+#endif
 
 static PyObject *tensorMapEncodeTiled(PyObject *self, PyObject *args) {
   CUtensorMap *tensorMap = (CUtensorMap *)malloc(sizeof(CUtensorMap));