From 8c85eb8ee200e4fa92523d09c107bfd0b366ed01 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Date: Tue, 16 Jul 2024 12:49:42 -0700
Subject: [PATCH 1/2] [Runtime] Dynamically load cuTensorMapEncodeTiled (#4330)

That symbol is only present in CUDA-12 compatible drivers and is missing
from CUDA-11 ones.

Spiritual follow-up to
https://github.com/triton-lang/triton/pull/2771: query the symbol
dynamically so that, when run on an older driver, the call returns an error.
Also, fix `occupancyMaxActiveClusters` behavior when the symbol is not found
(before this change it would crash with a null pointer dereference; now it
should return a structured exception).
---
 third_party/nvidia/backend/driver.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/third_party/nvidia/backend/driver.c b/third_party/nvidia/backend/driver.c
index 609e829609c1..dce2e861a306 100644
--- a/third_party/nvidia/backend/driver.c
+++ b/third_party/nvidia/backend/driver.c
@@ -143,6 +143,14 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
 typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
     int *numClusters, CUfunction func, const CUlaunchConfig *config);
 
+typedef CUresult (*cuTensorMapEncodeTiled_t)(
+    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
+    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
+    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
+    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
+    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
+    CUtensorMapFloatOOBfill oobFill);
+
 #define defineGetFunctionHandle(name, symbolName)                              \
   static symbolName##_t name() {                                               \
     /* Open the shared library */                                              \
@@ -168,6 +176,9 @@ typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
 defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
                         cuOccupancyMaxActiveClusters);
 
+defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
+                        cuTensorMapEncodeTiled);
+
 static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
   int clusterDimX = -1, clusterDimY = -1, clusterDimZ = -1,
       maxActiveClusters = -1;
@@ -206,6 +217,9 @@ static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
   static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
   if (cuOccupancyMaxActiveClusters == NULL) {
     cuOccupancyMaxActiveClusters = getCuOccupancyMaxActiveClustersHandle();
+    if (cuOccupancyMaxActiveClusters == NULL) {
+      return NULL;
+    }
   }
 
   Py_BEGIN_ALLOW_THREADS;
@@ -289,6 +303,13 @@ static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) {
   }
   assert((elementSize * tensorDim) >= 32 && "block size too small.");
   int rank = 1;
+  static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
+  if (cuTensorMapEncodeTiled == NULL) {
+    cuTensorMapEncodeTiled = getCuTensorMapEncodeTiledHandle();
+    if (cuTensorMapEncodeTiled == NULL) {
+      return NULL;
+    }
+  }
   CUresult result = cuTensorMapEncodeTiled(
       (CUtensorMap *)desc, type, rank, (void *)global_address, dims,
       globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,

From be60ccefe5184afd89c7a73dbe48d392a092e24f Mon Sep 17 00:00:00 2001
From: Keren Zhou <kerenzhou@openai.com>
Date: Wed, 17 Jul 2024 00:33:59 -0400
Subject: [PATCH 2/2] [RUNTIME] Fix the function lookup problem for CUDA 11
 driver (#4335)

The previous patch added the function pointer lookup to `fill1DTMADescriptor` but missed the one in `fill2DTMADescriptor`.
https://github.com/triton-lang/triton/commit/f9f2960deef376da4ebc1ff8b1546051c66894a4
---
 third_party/nvidia/backend/driver.c | 30 +++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/third_party/nvidia/backend/driver.c b/third_party/nvidia/backend/driver.c
index dce2e861a306..44524da27288 100644
--- a/third_party/nvidia/backend/driver.c
+++ b/third_party/nvidia/backend/driver.c
@@ -39,6 +39,17 @@ static bool gpuAssert(CUresult code, const char *file, int line) {
     }                                                                          \
   } while (0)
 
+// Lazily load a symbol old CUDA drivers may lack; on failure, return NULL.
+#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction)  \
+  do {                                                                         \
+    if ((funcPointer) == NULL) {                                               \
+      (funcPointer) = (initializerFunction)();                                 \
+      if ((funcPointer) == NULL) {                                             \
+        return NULL;                                                           \
+      }                                                                        \
+    }                                                                          \
+  } while (0)
+
 static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
   int device_id;
   if (!PyArg_ParseTuple(args, "i", &device_id))
@@ -215,12 +226,8 @@ static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
   config.attrs = launchAttr;
 
   static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
-  if (cuOccupancyMaxActiveClusters == NULL) {
-    cuOccupancyMaxActiveClusters = getCuOccupancyMaxActiveClustersHandle();
-    if (cuOccupancyMaxActiveClusters == NULL) {
-      return NULL;
-    }
-  }
+  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
+                                      getCuOccupancyMaxActiveClustersHandle);
 
   Py_BEGIN_ALLOW_THREADS;
   CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
@@ -304,12 +311,8 @@ static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) {
   assert((elementSize * tensorDim) >= 32 && "block size too small.");
   int rank = 1;
   static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
-  if (cuTensorMapEncodeTiled == NULL) {
-    cuTensorMapEncodeTiled = getCuTensorMapEncodeTiledHandle();
-    if (cuTensorMapEncodeTiled == NULL) {
-      return NULL;
-    }
-  }
+  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
+                                      getCuTensorMapEncodeTiledHandle);
   CUresult result = cuTensorMapEncodeTiled(
       (CUtensorMap *)desc, type, rank, (void *)global_address, dims,
       globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
@@ -371,6 +374,9 @@ static PyObject *fill2DTMADescriptor(PyObject *self, PyObject *args) {
   if (contigDimSizeInByte > 128) {
     tensorDims[0] = 128 / elementSize;
   }
+  static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
+  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
+                                      getCuTensorMapEncodeTiledHandle);
   CUresult result = cuTensorMapEncodeTiled(
       (CUtensorMap *)desc, type, rank, (void *)global_address, dims,
       globalStrides, tensorDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,