From 8c85eb8ee200e4fa92523d09c107bfd0b366ed01 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:49:42 -0700 Subject: [PATCH 1/2] [Runtime] Dynamically load cuTensorMapEncodeTiled (#4330) That symbol is only present in CUDA-12 compatible drivers, and is missing in CUDA-11 ones. Spiritual follow-up to https://github.com/triton-lang/triton/pull/2771: this allows for dynamic query of the symbol and, if run on an older driver, it will return an error. Also, fix `occupancyMaxActiveClusters` behavior when symbol is not found (before this change it would crash with null pointer deref, now it should return a structured exception) --- third_party/nvidia/backend/driver.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/third_party/nvidia/backend/driver.c b/third_party/nvidia/backend/driver.c index 609e829609c1..dce2e861a306 100644 --- a/third_party/nvidia/backend/driver.c +++ b/third_party/nvidia/backend/driver.c @@ -143,6 +143,14 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { typedef CUresult (*cuOccupancyMaxActiveClusters_t)( int *numClusters, CUfunction func, const CUlaunchConfig *config); +typedef CUresult (*cuTensorMapEncodeTiled_t)( + CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, + cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, + const cuuint64_t *globalStrides, const cuuint32_t *boxDim, + const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, + CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, + CUtensorMapFloatOOBfill oobFill); + #define defineGetFunctionHandle(name, symbolName) \ static symbolName##_t name() { \ /* Open the shared library */ \ @@ -168,6 +176,9 @@ typedef CUresult (*cuOccupancyMaxActiveClusters_t)( defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle, cuOccupancyMaxActiveClusters); +defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle, + cuTensorMapEncodeTiled); + static PyObject 
*occupancyMaxActiveClusters(PyObject *self, PyObject *args) { int clusterDimX = -1, clusterDimY = -1, clusterDimZ = -1, maxActiveClusters = -1; @@ -206,6 +217,9 @@ static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) { static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL; if (cuOccupancyMaxActiveClusters == NULL) { cuOccupancyMaxActiveClusters = getCuOccupancyMaxActiveClustersHandle(); + if (cuOccupancyMaxActiveClusters == NULL) { + return NULL; + } } Py_BEGIN_ALLOW_THREADS; @@ -289,6 +303,13 @@ static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) { } assert((elementSize * tensorDim) >= 32 && "block size too small."); int rank = 1; + static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL; + if (cuTensorMapEncodeTiled == NULL) { + cuTensorMapEncodeTiled = getCuTensorMapEncodeTiledHandle(); + if (cuTensorMapEncodeTiled == NULL) { + return NULL; + } + } CUresult result = cuTensorMapEncodeTiled( (CUtensorMap *)desc, type, rank, (void *)global_address, dims, globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, From be60ccefe5184afd89c7a73dbe48d392a092e24f Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Wed, 17 Jul 2024 00:33:59 -0400 Subject: [PATCH 2/2] [RUNTIME] Fix the function lookup problem for CUDA 11 driver (#4335) There was a function pointer lookup missing in the previous patch. https://github.com/triton-lang/triton/commit/f9f2960deef376da4ebc1ff8b1546051c66894a4 --- third_party/nvidia/backend/driver.c | 30 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/third_party/nvidia/backend/driver.c b/third_party/nvidia/backend/driver.c index dce2e861a306..44524da27288 100644 --- a/third_party/nvidia/backend/driver.c +++ b/third_party/nvidia/backend/driver.c @@ -39,6 +39,17 @@ static bool gpuAssert(CUresult code, const char *file, int line) { } \ } while (0) +// Used to check if functions exist in old CUDA driver versions. 
+#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction) \ + do { \ + if ((funcPointer) == NULL) { \ + (funcPointer) = (initializerFunction)(); \ + if ((funcPointer) == NULL) { \ + return NULL; \ + } \ + } \ + } while (0) + static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { int device_id; if (!PyArg_ParseTuple(args, "i", &device_id)) @@ -215,12 +226,8 @@ static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) { config.attrs = launchAttr; static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL; - if (cuOccupancyMaxActiveClusters == NULL) { - cuOccupancyMaxActiveClusters = getCuOccupancyMaxActiveClustersHandle(); - if (cuOccupancyMaxActiveClusters == NULL) { - return NULL; - } - } + INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters, + getCuOccupancyMaxActiveClustersHandle); Py_BEGIN_ALLOW_THREADS; CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute( @@ -304,12 +311,8 @@ static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) { assert((elementSize * tensorDim) >= 32 && "block size too small."); int rank = 1; static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL; - if (cuTensorMapEncodeTiled == NULL) { - cuTensorMapEncodeTiled = getCuTensorMapEncodeTiledHandle(); - if (cuTensorMapEncodeTiled == NULL) { - return NULL; - } - } + INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled, + getCuTensorMapEncodeTiledHandle); CUresult result = cuTensorMapEncodeTiled( (CUtensorMap *)desc, type, rank, (void *)global_address, dims, globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE, @@ -371,6 +374,9 @@ static PyObject *fill2DTMADescriptor(PyObject *self, PyObject *args) { if (contigDimSizeInByte > 128) { tensorDims[0] = 128 / elementSize; } + static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL; + INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled, + getCuTensorMapEncodeTiledHandle); CUresult result = 
cuTensorMapEncodeTiled( (CUtensorMap *)desc, type, rank, (void *)global_address, dims, globalStrides, tensorDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,