53 changes: 6 additions & 47 deletions numba_cuda/numba/cuda/cudadrv/driver.py
@@ -29,13 +29,7 @@
 import re
 from itertools import product
 from abc import ABCMeta, abstractmethod
-from ctypes import (
-    c_int,
-    byref,
-    c_size_t,
-    c_void_p,
-    c_uint8,
-)
+from ctypes import c_int, byref, c_void_p, c_uint8
 import contextlib
 import importlib
 import numpy as np
@@ -47,7 +41,7 @@
 from numba.cuda import utils, serialize
 from .error import CudaSupportError, CudaDriverError
 from .drvapi import API_PROTOTYPES
-from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj
+from .drvapi import cu_stream_callback_pyobj
 from .mappings import FILE_EXTENSION_MAP
 from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
 from numba.cuda.utils import cached_file_read
@@ -1179,45 +1173,10 @@ def get_max_potential_block_size(
         :param blocksizelimit: maximum block size the kernel is designed to
                                handle
         """
-        args = (func, b2d_func, memsize, blocksizelimit, flags)
-        return self._cuda_python_max_potential_block_size(*args)
-
-    def _ctypes_max_potential_block_size(
-        self, func, b2d_func, memsize, blocksizelimit, flags
-    ):
-        gridsize = c_int()
-        blocksize = c_int()
-        b2d_cb = cu_occupancy_b2d_size(b2d_func)
-        args = [
-            byref(gridsize),
-            byref(blocksize),
-            func.handle,
-            b2d_cb,
-            memsize,
-            blocksizelimit,
-        ]
-
-        if not flags:
-            driver.cuOccupancyMaxPotentialBlockSize(*args)
-        else:
-            args.append(flags)
-            driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)
-
-        return (gridsize.value, blocksize.value)
-
-    def _cuda_python_max_potential_block_size(
-        self, func, b2d_func, memsize, blocksizelimit, flags
-    ):
-        b2d_cb = ctypes.CFUNCTYPE(c_size_t, c_int)(b2d_func)
-        ptr = int.from_bytes(b2d_cb, byteorder="little")
-        driver_b2d_cb = binding.CUoccupancyB2DSize(ptr)
-        args = [func.handle, driver_b2d_cb, memsize, blocksizelimit]
-
-        if not flags:
-            return driver.cuOccupancyMaxPotentialBlockSize(*args)
-        else:
-            args.append(flags)
-            return driver.cuOccupancyMaxPotentialBlockSizeWithFlags(*args)
+        return (
+            binding.CUresult.CUDA_SUCCESS,
+            func.kernel.attributes.max_threads_per_block(),
+        )
Comment on lines +1176 to +1179 (Contributor):
Ignores b2d_func, memsize, blocksizelimit, and flags parameters. Previous implementation computed optimal block size via cuOccupancyMaxPotentialBlockSize considering dynamic shared memory. Now returns hardware maximum, which may be suboptimal for kernels with significant shared memory usage.


def prepare_for_use(self):
"""Initialize the context for use.
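To illustrate the reviewer's point above: before this change, b2d_func, memsize, and blocksizelimit were forwarded to cuOccupancyMaxPotentialBlockSize, so the suggested block size accounted for dynamic shared memory. Below is a minimal standalone sketch of that occupancy query using the cuda-python bindings, adapted from the deleted _cuda_python_max_potential_block_size; it is not part of this PR, and the helper name, import alias, and kernel_handle parameter are illustrative assumptions.

```python
import ctypes
from ctypes import c_int, c_size_t

# cuda-python low-level driver bindings (import path assumed).
from cuda.bindings import driver as cuda_driver


def occupancy_max_potential_block_size(kernel_handle, b2d_func, memsize, blocksizelimit):
    """Suggest (min_grid_size, block_size) for a kernel, honouring the
    block-size -> dynamic-shared-memory callback as the deleted code did."""
    # Wrap the Python callback as a C function pointer with the
    # size_t (*)(int block_size) signature expected by the driver.
    c_callback = ctypes.CFUNCTYPE(c_size_t, c_int)(b2d_func)
    ptr = ctypes.cast(c_callback, ctypes.c_void_p).value
    driver_callback = cuda_driver.CUoccupancyB2DSize(ptr)

    # cuda-python returns a (CUresult, min_grid_size, block_size) tuple.
    err, min_grid, block = cuda_driver.cuOccupancyMaxPotentialBlockSize(
        kernel_handle, driver_callback, memsize, blocksizelimit
    )
    if err != cuda_driver.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"cuOccupancyMaxPotentialBlockSize failed: {err}")
    return min_grid, block
```

A caller that allocates dynamic shared memory proportional to the block size would pass something like `lambda bs: bs * 8` as b2d_func; the simplified implementation in this PR no longer consults that callback and instead reports the kernel's hardware maximum.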
9 changes: 0 additions & 9 deletions numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py
@@ -263,15 +263,6 @@ def test_cuda_driver_occupancy(self):
         )
         self.assertTrue(value > 0)
 
-        def b2d(bs):
-            return bs
-
-        grid, block = self.context.get_max_potential_block_size(
-            function, b2d, 128, 128
-        )
-        self.assertTrue(grid > 0)
-        self.assertTrue(block > 0)
-
     def test_cuda_cache_config(self):
         from numba import types
         import numpy as np
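If coverage of the simplified code path is still wanted, a few lines like the following could slot back into test_cuda_driver_occupancy in place of the deleted assertions. This is only a sketch: it reuses the `function` and `self.context` objects already in scope in that test, and the expected (CUresult, block size) return shape and the cuda-python import path are assumptions taken from the new driver.py body.

```python
        from cuda.bindings import driver as binding  # import path is an assumption

        # The simplified implementation ignores the b2d callback and the
        # memsize/blocksizelimit arguments, so any callback works; it reports
        # the kernel's hardware maximum block size rather than an
        # occupancy-based suggestion.
        status, block = self.context.get_max_potential_block_size(
            function, lambda bs: bs, 128, 128
        )
        self.assertEqual(status, binding.CUresult.CUDA_SUCCESS)
        self.assertTrue(block > 0)
```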