Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
beeadd1
basic test, bits and pieces of nrt needed
brandon-b-miller Oct 14, 2025
634e976
some progress
brandon-b-miller Oct 14, 2025
9e12144
Refactor
brandon-b-miller Oct 15, 2025
c4d4abe
more reductions
brandon-b-miller Oct 15, 2025
06226f2
fix ufuncs
brandon-b-miller Oct 15, 2025
2f48967
Merge branch 'main' into vendor-test-array-reductions
brandon-b-miller Oct 15, 2025
6611805
enable nrt
brandon-b-miller Oct 15, 2025
7bc54a0
fixes
brandon-b-miller Oct 15, 2025
8f87a44
pass
brandon-b-miller Oct 16, 2025
9969223
faster?
brandon-b-miller Oct 16, 2025
3ad3e1f
Merge branch 'main' into vendor-test-array-reductions
brandon-b-miller Oct 21, 2025
ddb0062
Merge branch 'main' into vendor-test-array-reductions
brandon-b-miller Oct 22, 2025
398489b
Apply suggestions from code review
brandon-b-miller Oct 22, 2025
b58d757
export
brandon-b-miller Oct 22, 2025
a7e24cc
basic
brandon-b-miller Oct 23, 2025
14026e8
add test that exhibits the failure
brandon-b-miller Oct 23, 2025
10f21fd
implement fix
brandon-b-miller Oct 23, 2025
51cf513
Merge branch 'fix-numpy-array-device-copy' into vendor-test-array-red…
brandon-b-miller Oct 24, 2025
054041f
updates
brandon-b-miller Oct 27, 2025
7f5a647
merge/resolve
brandon-b-miller Oct 31, 2025
0595cb9
move files, add NRTEnablingCUDATestCase
brandon-b-miller Oct 31, 2025
108df98
cleanup
brandon-b-miller Oct 31, 2025
dd1de26
more tests, buggy copy exposed again
brandon-b-miller Oct 31, 2025
5aa441a
partial list impl
brandon-b-miller Nov 4, 2025
9e4ddfe
merge/refactor
brandon-b-miller Nov 4, 2025
5ab95bc
more tests
brandon-b-miller Nov 5, 2025
ebb65ef
Merge branch 'main' into vendor-test-array-reductions-2
brandon-b-miller Nov 12, 2025
12ef058
renaming
brandon-b-miller Nov 12, 2025
814f800
merge/resolve
brandon-b-miller Dec 11, 2025
ebfd572
merge/resolve
brandon-b-miller Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 23 additions & 29 deletions numba_cuda/numba/cuda/np/arrayobj.py
Original file line number Diff line number Diff line change
Expand Up @@ -5504,37 +5504,31 @@ def _array_copy(context, builder, sig, args):
dest_data = ret.data

assert rettype.layout in "CF"
if arytype.layout == rettype.layout:
# Fast path: memcpy
cgutils.raw_memcpy(
builder, dest_data, src_data, ary.nitems, ary.itemsize, align=1
)

else:
src_strides = cgutils.unpack_tuple(builder, ary.strides)
dest_strides = cgutils.unpack_tuple(builder, ret.strides)
intp_t = context.get_value_type(types.intp)
src_strides = cgutils.unpack_tuple(builder, ary.strides)
dest_strides = cgutils.unpack_tuple(builder, ret.strides)
intp_t = context.get_value_type(types.intp)

with cgutils.loop_nest(builder, shapes, intp_t) as indices:
src_ptr = cgutils.get_item_pointer2(
context,
builder,
src_data,
shapes,
src_strides,
arytype.layout,
indices,
)
dest_ptr = cgutils.get_item_pointer2(
context,
builder,
dest_data,
shapes,
dest_strides,
rettype.layout,
indices,
)
builder.store(builder.load(src_ptr), dest_ptr)
with cgutils.loop_nest(builder, shapes, intp_t) as indices:
src_ptr = cgutils.get_item_pointer2(
context,
builder,
src_data,
shapes,
src_strides,
arytype.layout,
indices,
)
dest_ptr = cgutils.get_item_pointer2(
context,
builder,
dest_data,
shapes,
dest_strides,
rettype.layout,
indices,
)
builder.store(builder.load(src_ptr), dest_ptr)
Comment on lines 5506 to +5531
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logic: the memcpy fast path that was used when the source and destination layouts match has been removed - this causes a performance regression for same-layout copies

the original code had:

if arytype.layout == rettype.layout:
    # Fast path: memcpy
    cgutils.raw_memcpy(builder, dest_data, src_data, ary.nitems, ary.itemsize, align=1)

consider restoring the fast path for performance


return impl_ret_new_ref(context, builder, sig.return_type, ret._getvalue())

Expand Down
28 changes: 28 additions & 0 deletions numba_cuda/numba/cuda/target.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,34 @@ def mangler(self, name, argtypes, *, abi_tags=(), uid=None):
name, argtypes, abi_tags=abi_tags, uid=uid
)

def make_constant_list(self, builder, listty, lst):
    """
    Emit ``lst`` as a list value backed by a constant address space global.

    Each element obtained by iterating ``np.array(lst)`` is converted with
    ``self.get_constant`` using ``listty.dtype``, and ``self.build_list``
    assembles the results. That value is wrapped in an ``ir.Constant`` and
    installed as the initializer of an internal-linkage global named
    ``_cudapy_clist`` placed in ``nvvm.ADDRSPACE_CONSTANT``. The global is
    then address-space cast, loaded, and wrapped in a struct proxy for
    ``listty``; the proxy's value is returned.
    """
    import numpy as np

    element_dtype = listty.dtype
    elements = []
    for item in np.array(lst):
        elements.append(self.get_constant(element_dtype, item))
    built = self.build_list(builder, listty, elements)

    # Create the constant address space version of the list.
    module = builder.module
    llvm_list_type = built.type
    initializer = ir.Constant(llvm_list_type, built)
    global_var = cgutils.add_global_variable(
        module,
        initializer.type,
        "_cudapy_clist",
        addrspace=nvvm.ADDRSPACE_CONSTANT,
    )
    global_var.linkage = "internal"
    global_var.global_constant = True
    global_var.initializer = initializer

    # Convert to the generic address space before loading the value.
    generic_ptr_type = ir.PointerType(llvm_list_type)
    generic_ptr = builder.addrspacecast(
        global_var, generic_ptr_type, "generic"
    )
    proxy_cls = cgutils.create_struct_proxy(listty)
    loaded = builder.load(generic_ptr)
    proxy = proxy_cls(self, builder, value=loaded)
    return proxy._getvalue()

def make_constant_array(self, builder, aryty, arr):
"""
Unlike the parent version. This returns a pointer in the constant
Expand Down
11 changes: 11 additions & 0 deletions numba_cuda/numba/cuda/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,17 @@ def assertFileCheckMatches(
)


class NRTEnablingCUDATestCase(CUDATestCase):
    """
    CUDA test case that sets ``config.CUDA_ENABLE_NRT`` to ``True`` for the
    duration of each test and puts the previous value back afterwards.
    """

    def setUp(self):
        """Remember the current NRT setting, enable NRT, then run the
        parent ``setUp``."""
        self.old_nrt_setting = config.CUDA_ENABLE_NRT
        config.CUDA_ENABLE_NRT = True
        try:
            super().setUp()
        except BaseException:
            # unittest does not call tearDown() when setUp() raises (this
            # includes skips), so undo the global change here to avoid
            # leaking the enabled setting into subsequent tests.
            config.CUDA_ENABLE_NRT = self.old_nrt_setting
            raise

    def tearDown(self):
        """Restore the NRT setting saved in ``setUp``, then run the parent
        ``tearDown``."""
        config.CUDA_ENABLE_NRT = self.old_nrt_setting
        super().tearDown()


def skip_on_cudasim(reason):
"""Skip this test if running on the CUDA simulator"""
assert isinstance(reason, str)
Expand Down
14 changes: 2 additions & 12 deletions numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@

import numpy as np
from numba import cuda
from numba.cuda.testing import CUDATestCase
from numba.cuda.testing import NRTEnablingCUDATestCase
import unittest
from numba.cuda import config


def reinterpret_array_type(byte_arr, start, stop, output):
Expand All @@ -14,16 +13,7 @@ def reinterpret_array_type(byte_arr, start, stop, output):
output[0] = val


class TestCudaArrayMethods(CUDATestCase):
def setUp(self):
self.old_nrt_setting = config.CUDA_ENABLE_NRT
config.CUDA_ENABLE_NRT = True
super(TestCudaArrayMethods, self).setUp()

def tearDown(self):
config.CUDA_ENABLE_NRT = self.old_nrt_setting
super(TestCudaArrayMethods, self).tearDown()

class TestCudaArrayMethods(NRTEnablingCUDATestCase):
def test_reinterpret_array_type(self):
"""
Reinterpret byte array as int32 in the GPU.
Expand Down
Loading
Loading