NVIDIA · isVoid · Apr 22, 2025 · Apr 22, 2025
diff --git a/numba_cuda/numba/cuda/cudadecl.py b/numba_cuda/numba/cuda/cudadecl.py
@@ -100,43 +100,43 @@ class Cuda_syncwarp(ConcreteTemplate):
     cases = [signature(types.none), signature(types.none, types.i4)]
 
 
-@register
-class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
-    key = cuda.shfl_sync_intrinsic
-    cases = [
-        signature(
-            types.Tuple((types.i4, types.b1)),
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.i8, types.b1)),
-            types.i4,
-            types.i4,
-            types.i8,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f4, types.b1)),
-            types.i4,
-            types.i4,
-            types.f4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f8, types.b1)),
-            types.i4,
-            types.i4,
-            types.f8,
-            types.i4,
-            types.i4,
-        ),
-    ]
+# @register
+# class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
+#    key = cuda.shfl_sync_intrinsic
+#    cases = [
+#        signature(
+#            types.Tuple((types.i4, types.b1)),
+#            types.i4,
+#            types.i4,
+#            types.i4,
+#            types.i4,
+#            types.i4,
+#        ),
+#        signature(
+#            types.Tuple((types.i8, types.b1)),
+#            types.i4,
+#            types.i4,
+#            types.i8,
+#            types.i4,
+#            types.i4,
+#        ),
+#        signature(
+#            types.Tuple((types.f4, types.b1)),
+#            types.i4,
+#            types.i4,
+#            types.f4,
+#            types.i4,
+#            types.i4,
+#        ),
+#        signature(
+#            types.Tuple((types.f8, types.b1)),
+#            types.i4,
+#            types.i4,
+#            types.f8,
+#            types.i4,
+#            types.i4,
+#        ),
+#    ]
 
 
 @register
@@ -815,8 +815,8 @@ def resolve_threadfence_system(self, mod):
     def resolve_syncwarp(self, mod):
         return types.Function(Cuda_syncwarp)
 
-    def resolve_shfl_sync_intrinsic(self, mod):
-        return types.Function(Cuda_shfl_sync_intrinsic)
+    # def resolve_shfl_sync_intrinsic(self, mod):
+    #    return types.Function(Cuda_shfl_sync_intrinsic)
 
     def resolve_vote_sync_intrinsic(self, mod):
         return types.Function(Cuda_vote_sync_intrinsic)

diff --git a/numba_cuda/numba/cuda/cudaimpl.py b/numba_cuda/numba/cuda/cudaimpl.py
@@ -204,67 +204,68 @@ def ptx_syncwarp_mask(context, builder, sig, args):
     return context.get_dummy_value()
 
 
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
-)
-def ptx_shfl_sync_i32(context, builder, sig, args):
-    """
-    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
-    function supports both 32 and 64 bit ints and floats, so for feature parity,
-    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
-    an int, then shuffling, then bitcasting back. And 64-bit values by packing
-    them into 2 32bit values, shuffling thoose, and then packing back together.
-    """
-    mask, mode, value, index, clamp = args
-    value_type = sig.args[2]
-    if value_type in types.real_domain:
-        value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
-    fname = "llvm.nvvm.shfl.sync.i32"
-    lmod = builder.module
-    fnty = ir.FunctionType(
-        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
-        (
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-        ),
-    )
-    func = cgutils.get_or_insert_function(lmod, fnty, fname)
-    if value_type.bitwidth == 32:
-        ret = builder.call(func, (mask, mode, value, index, clamp))
-        if value_type == types.float32:
-            rv = builder.extract_value(ret, 0)
-            pred = builder.extract_value(ret, 1)
-            fv = builder.bitcast(rv, ir.FloatType())
-            ret = cgutils.make_anonymous_struct(builder, (fv, pred))
-    else:
-        value1 = builder.trunc(value, ir.IntType(32))
-        value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
-        value2 = builder.trunc(value_lshr, ir.IntType(32))
-        ret1 = builder.call(func, (mask, mode, value1, index, clamp))
-        ret2 = builder.call(func, (mask, mode, value2, index, clamp))
-        rv1 = builder.extract_value(ret1, 0)
-        rv2 = builder.extract_value(ret2, 0)
-        pred = builder.extract_value(ret1, 1)
-        rv1_64 = builder.zext(rv1, ir.IntType(64))
-        rv2_64 = builder.zext(rv2, ir.IntType(64))
-        rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
-        rv = builder.or_(rv_shl, rv1_64)
-        if value_type == types.float64:
-            rv = builder.bitcast(rv, ir.DoubleType())
-        ret = cgutils.make_anonymous_struct(builder, (rv, pred))
-    return ret
+# @lower(
+#    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
+# )
+# @lower(
+#    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
+# )
+# @lower(
+#    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
+# )
+# @lower(
+#    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
+# )
+# def ptx_shfl_sync_i32(context, builder, sig, args):
+#    """
+#    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
+#    function supports both 32 and 64 bit ints and floats, so for feature parity,
+#    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
+#    an int, then shuffling, then bitcasting back. And 64-bit values by packing
+#    them into 2 32bit values, shuffling those, and then packing back together.
+#    """
+#    mask, mode, value, index, clamp = args
+#    value_type = sig.args[2]
+#    if value_type in types.real_domain:
+#        value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
+#    fname = "llvm.nvvm.shfl.sync.i32"
+#    lmod = builder.module
+#    fnty = ir.FunctionType(
+#        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
+#        (
+#            ir.IntType(32),
+#            ir.IntType(32),
+#            ir.IntType(32),
+#            ir.IntType(32),
+#            ir.IntType(32),
+#        ),
+#    )
+#    func = cgutils.get_or_insert_function(lmod, fnty, fname)
+#    if value_type.bitwidth == 32:
+#        ret = builder.call(func, (mask, mode, value, index, clamp))
+#        if value_type == types.float32:
+#            rv = builder.extract_value(ret, 0)
+#            pred = builder.extract_value(ret, 1)
+#            fv = builder.bitcast(rv, ir.FloatType())
+#            ret = cgutils.make_anonymous_struct(builder, (fv, pred))
+#    else:
+#        value1 = builder.trunc(value, ir.IntType(32))
+#        value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
+#        value2 = builder.trunc(value_lshr, ir.IntType(32))
+#        ret1 = builder.call(func, (mask, mode, value1, index, clamp))
+#        ret2 = builder.call(func, (mask, mode, value2, index, clamp))
+#        rv1 = builder.extract_value(ret1, 0)
+#        rv2 = builder.extract_value(ret2, 0)
+#        pred = builder.extract_value(ret1, 1)
+#        rv1_64 = builder.zext(rv1, ir.IntType(64))
+#        rv2_64 = builder.zext(rv2, ir.IntType(64))
+#        rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
+#        rv = builder.or_(rv_shl, rv1_64)
+#        if value_type == types.float64:
+#            rv = builder.bitcast(rv, ir.DoubleType())
+#        ret = cgutils.make_anonymous_struct(builder, (rv, pred))
+#    return ret
+#
 
 
 @lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)

diff --git a/numba_cuda/numba/cuda/device_init.py b/numba_cuda/numba/cuda/device_init.py
@@ -13,7 +13,7 @@
     local,
     const,
     atomic,
-    shfl_sync_intrinsic,
+    # shfl_sync_intrinsic,
     vote_sync_intrinsic,
     match_any_sync,
     match_all_sync,
@@ -40,6 +40,10 @@
     syncthreads_and,
     syncthreads_count,
     syncthreads_or,
+    shfl_sync,
+    shfl_up_sync,
+    shfl_down_sync,
+    shfl_xor_sync,
 )
 from .cudadrv.error import CudaSupportError
 from numba.cuda.cudadrv.driver import (
@@ -68,10 +72,6 @@
     any_sync,
     eq_sync,
     ballot_sync,
-    shfl_sync,
-    shfl_up_sync,
-    shfl_down_sync,
-    shfl_xor_sync,
 )
 
 from .kernels import reduction

diff --git a/numba_cuda/numba/cuda/intrinsic_wrapper.py b/numba_cuda/numba/cuda/intrinsic_wrapper.py
@@ -38,40 +38,40 @@ def ballot_sync(mask, predicate):
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
 
 
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-
-
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
+# @jit(device=True)
+# def shfl_sync(mask, value, src_lane):
+#    """
+#    Shuffles value across the masked warp and returns the value
+#    from src_lane. If this is outside the warp, then the
+#    given value is returned.
+#    """
+#    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
+
+
+# @jit(device=True)
+# def shfl_up_sync(mask, value, delta):
+#    """
+#    Shuffles value across the masked warp and returns the value
+#    from (laneid - delta). If this is outside the warp, then the
+#    given value is returned.
+#    """
+#    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
+#
+#
+# @jit(device=True)
+# def shfl_down_sync(mask, value, delta):
+#    """
+#    Shuffles value across the masked warp and returns the value
+#    from (laneid + delta). If this is outside the warp, then the
+#    given value is returned.
+#    """
+#    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
+#
+#
+# @jit(device=True)
+# def shfl_xor_sync(mask, value, lane_mask):
+#    """
+#    Shuffles value across the masked warp and returns the value
+#    from (laneid ^ lane_mask).
+#    """
+#    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]