diff --git a/numba_cuda/numba/cuda/cudadecl.py b/numba_cuda/numba/cuda/cudadecl.py
index 547272601..e348fd5f9 100644
--- a/numba_cuda/numba/cuda/cudadecl.py
+++ b/numba_cuda/numba/cuda/cudadecl.py
@@ -100,43 +100,43 @@ class Cuda_syncwarp(ConcreteTemplate):
     cases = [signature(types.none), signature(types.none, types.i4)]
 
 
-@register
-class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
-    key = cuda.shfl_sync_intrinsic
-    cases = [
-        signature(
-            types.Tuple((types.i4, types.b1)),
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.i8, types.b1)),
-            types.i4,
-            types.i4,
-            types.i8,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f4, types.b1)),
-            types.i4,
-            types.i4,
-            types.f4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f8, types.b1)),
-            types.i4,
-            types.i4,
-            types.f8,
-            types.i4,
-            types.i4,
-        ),
-    ]
+# @register
+# class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
+#     key = cuda.shfl_sync_intrinsic
+#     cases = [
+#         signature(
+#             types.Tuple((types.i4, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.i4,
+#             types.i4,
+#             types.i4,
+#         ),
+#         signature(
+#             types.Tuple((types.i8, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.i8,
+#             types.i4,
+#             types.i4,
+#         ),
+#         signature(
+#             types.Tuple((types.f4, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.f4,
+#             types.i4,
+#             types.i4,
+#         ),
+#         signature(
+#             types.Tuple((types.f8, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.f8,
+#             types.i4,
+#             types.i4,
+#         ),
+#     ]
 
 
 @register
@@ -815,8 +815,8 @@ def resolve_threadfence_system(self, mod):
     def resolve_syncwarp(self, mod):
         return types.Function(Cuda_syncwarp)
 
-    def resolve_shfl_sync_intrinsic(self, mod):
-        return types.Function(Cuda_shfl_sync_intrinsic)
+    # def resolve_shfl_sync_intrinsic(self, mod):
+    #     return types.Function(Cuda_shfl_sync_intrinsic)
 
     def resolve_vote_sync_intrinsic(self, mod):
         return types.Function(Cuda_vote_sync_intrinsic)
diff --git a/numba_cuda/numba/cuda/cudaimpl.py b/numba_cuda/numba/cuda/cudaimpl.py
index 931c43e31..9dc3b67bf 100644
--- a/numba_cuda/numba/cuda/cudaimpl.py
+++ b/numba_cuda/numba/cuda/cudaimpl.py
@@ -204,67 +204,68 @@ def ptx_syncwarp_mask(context, builder, sig, args):
     return context.get_dummy_value()
 
 
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
-)
-def ptx_shfl_sync_i32(context, builder, sig, args):
-    """
-    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
-    function supports both 32 and 64 bit ints and floats, so for feature parity,
-    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
-    an int, then shuffling, then bitcasting back. And 64-bit values by packing
-    them into 2 32bit values, shuffling thoose, and then packing back together.
-    """
-    mask, mode, value, index, clamp = args
-    value_type = sig.args[2]
-    if value_type in types.real_domain:
-        value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
-    fname = "llvm.nvvm.shfl.sync.i32"
-    lmod = builder.module
-    fnty = ir.FunctionType(
-        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
-        (
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-        ),
-    )
-    func = cgutils.get_or_insert_function(lmod, fnty, fname)
-    if value_type.bitwidth == 32:
-        ret = builder.call(func, (mask, mode, value, index, clamp))
-        if value_type == types.float32:
-            rv = builder.extract_value(ret, 0)
-            pred = builder.extract_value(ret, 1)
-            fv = builder.bitcast(rv, ir.FloatType())
-            ret = cgutils.make_anonymous_struct(builder, (fv, pred))
-    else:
-        value1 = builder.trunc(value, ir.IntType(32))
-        value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
-        value2 = builder.trunc(value_lshr, ir.IntType(32))
-        ret1 = builder.call(func, (mask, mode, value1, index, clamp))
-        ret2 = builder.call(func, (mask, mode, value2, index, clamp))
-        rv1 = builder.extract_value(ret1, 0)
-        rv2 = builder.extract_value(ret2, 0)
-        pred = builder.extract_value(ret1, 1)
-        rv1_64 = builder.zext(rv1, ir.IntType(64))
-        rv2_64 = builder.zext(rv2, ir.IntType(64))
-        rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
-        rv = builder.or_(rv_shl, rv1_64)
-        if value_type == types.float64:
-            rv = builder.bitcast(rv, ir.DoubleType())
-        ret = cgutils.make_anonymous_struct(builder, (rv, pred))
-    return ret
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
+# )
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
+# )
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
+# )
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
+# )
+# def ptx_shfl_sync_i32(context, builder, sig, args):
+#     """
+#     The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
+#     function supports both 32 and 64 bit ints and floats, so for feature parity,
+#     i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
+#     an int, then shuffling, then bitcasting back. And 64-bit values by packing
+#     them into 2 32bit values, shuffling those, and then packing back together.
+#     """
+#     mask, mode, value, index, clamp = args
+#     value_type = sig.args[2]
+#     if value_type in types.real_domain:
+#         value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
+#     fname = "llvm.nvvm.shfl.sync.i32"
+#     lmod = builder.module
+#     fnty = ir.FunctionType(
+#         ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
+#         (
+#             ir.IntType(32),
+#             ir.IntType(32),
+#             ir.IntType(32),
+#             ir.IntType(32),
+#             ir.IntType(32),
+#         ),
+#     )
+#     func = cgutils.get_or_insert_function(lmod, fnty, fname)
+#     if value_type.bitwidth == 32:
+#         ret = builder.call(func, (mask, mode, value, index, clamp))
+#         if value_type == types.float32:
+#             rv = builder.extract_value(ret, 0)
+#             pred = builder.extract_value(ret, 1)
+#             fv = builder.bitcast(rv, ir.FloatType())
+#             ret = cgutils.make_anonymous_struct(builder, (fv, pred))
+#     else:
+#         value1 = builder.trunc(value, ir.IntType(32))
+#         value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
+#         value2 = builder.trunc(value_lshr, ir.IntType(32))
+#         ret1 = builder.call(func, (mask, mode, value1, index, clamp))
+#         ret2 = builder.call(func, (mask, mode, value2, index, clamp))
+#         rv1 = builder.extract_value(ret1, 0)
+#         rv2 = builder.extract_value(ret2, 0)
+#         pred = builder.extract_value(ret1, 1)
+#         rv1_64 = builder.zext(rv1, ir.IntType(64))
+#         rv2_64 = builder.zext(rv2, ir.IntType(64))
+#         rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
+#         rv = builder.or_(rv_shl, rv1_64)
+#         if value_type == types.float64:
+#             rv = builder.bitcast(rv, ir.DoubleType())
+#         ret = cgutils.make_anonymous_struct(builder, (rv, pred))
+#     return ret
+#
 
 
 @lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
diff --git a/numba_cuda/numba/cuda/device_init.py b/numba_cuda/numba/cuda/device_init.py
index da8074754..24b741299 100644
--- a/numba_cuda/numba/cuda/device_init.py
+++ b/numba_cuda/numba/cuda/device_init.py
@@ -13,7 +13,7 @@
     local,
     const,
     atomic,
-    shfl_sync_intrinsic,
+    # shfl_sync_intrinsic,
    vote_sync_intrinsic,
     match_any_sync,
     match_all_sync,
@@ -40,6 +40,10 @@
     syncthreads_and,
     syncthreads_count,
     syncthreads_or,
+    shfl_sync,
+    shfl_up_sync,
+    shfl_down_sync,
+    shfl_xor_sync,
 )
 from .cudadrv.error import CudaSupportError
 from numba.cuda.cudadrv.driver import (
@@ -68,10 +72,6 @@
     any_sync,
     eq_sync,
     ballot_sync,
-    shfl_sync,
-    shfl_up_sync,
-    shfl_down_sync,
-    shfl_xor_sync,
 )
 
 from .kernels import reduction
diff --git a/numba_cuda/numba/cuda/intrinsic_wrapper.py b/numba_cuda/numba/cuda/intrinsic_wrapper.py
index cfbdf06fe..9ff85c300 100644
--- a/numba_cuda/numba/cuda/intrinsic_wrapper.py
+++ b/numba_cuda/numba/cuda/intrinsic_wrapper.py
@@ -38,40 +38,40 @@ def ballot_sync(mask, predicate):
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
 
 
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-
-
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
+# @jit(device=True)
+# def shfl_sync(mask, value, src_lane):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from src_lane. If this is outside the warp, then the
+#     given value is returned.
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
+
+
+# @jit(device=True)
+# def shfl_up_sync(mask, value, delta):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from (laneid - delta). If this is outside the warp, then the
+#     given value is returned.
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
+#
+#
+# @jit(device=True)
+# def shfl_down_sync(mask, value, delta):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from (laneid + delta). If this is outside the warp, then the
+#     given value is returned.
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
+#
+#
+# @jit(device=True)
+# def shfl_xor_sync(mask, value, lane_mask):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from (laneid ^ lane_mask).
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
diff --git a/numba_cuda/numba/cuda/intrinsics.py b/numba_cuda/numba/cuda/intrinsics.py
index 2691ee8eb..505c2a504 100644
--- a/numba_cuda/numba/cuda/intrinsics.py
+++ b/numba_cuda/numba/cuda/intrinsics.py
@@ -205,3 +205,139 @@ def syncthreads_or(typingctx, predicate):
 @overload_method(types.Integer, "bit_count", target="cuda")
 def integer_bit_count(i):
     return lambda i: cuda.popc(i)
+
+
+# -----------------------------------------------------------------------------
+# shfl_sync
+
+
+@intrinsic
+def shfl_sync(typingctx, mask, value, src_lane):
+    """
+    Shuffles value across the masked warp and returns the value
+    from src_lane. If this is outside the warp, then the
+    given value is returned.
+    """
+    mode_value = 0
+    clamp_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, src_lane, clamp_value
+    )
+
+
+@intrinsic
+def shfl_up_sync(typingctx, mask, value, delta):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid - delta). If this is outside the warp, then the
+    given value is returned.
+    """
+    mode_value = 1
+    clamp_value = 0
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, delta, clamp_value
+    )
+
+
+@intrinsic
+def shfl_down_sync(typingctx, mask, value, delta):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid + delta). If this is outside the warp, then the
+    given value is returned.
+    """
+    mode_value = 2
+    clamp_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, delta, clamp_value
+    )
+
+
+@intrinsic
+def shfl_xor_sync(typingctx, mask, value, lane_mask):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid ^ lane_mask).
+    """
+    mode_value = 3
+    clamp_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, lane_mask, clamp_value
+    )
+
+
+def shfl_sync_intrinsic(
+    typingctx, mask, mode_value, value, src_lane, clamp_value
+):
+    if value not in (types.i4, types.i8, types.f4, types.f8):
+        # XXX: Support a more general set of types?
+        raise TypingError(
+            "shfl_sync only supports 32- and 64-bit ints and floats"
+        )
+
+    # The result type matches the shuffled value; the i1 predicate returned
+    # by the NVVM intrinsic is dropped. mask and src_lane are narrowed to
+    # i32 in the codegen below, since Python ints default to int64.
+    sig = signature(value, mask, value, src_lane)
+
+    def codegen(context, builder, sig, args):
+        """
+        The NVVM intrinsic for shfl only supports i32, but the CUDA
+        intrinsic function supports both 32- and 64-bit ints and floats,
+        so for feature parity, i64, f32, and f64 are also implemented.
+        Floats are bitcast to an int, shuffled, then bitcast back; 64-bit
+        values are split into two 32-bit halves, each half is shuffled,
+        and the result is packed back together.
+        """
+        mask, value, index = args
+        value_type = sig.args[1]
+        if value_type in types.real_domain:
+            value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
+        fname = "llvm.nvvm.shfl.sync.i32"
+        lmod = builder.module
+        fnty = ir.FunctionType(
+            ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
+            (
+                ir.IntType(32),
+                ir.IntType(32),
+                ir.IntType(32),
+                ir.IntType(32),
+                ir.IntType(32),
+            ),
+        )
+
+        i32 = ir.IntType(32)
+        mode = ir.Constant(i32, mode_value)
+        clamp = ir.Constant(i32, clamp_value)
+        if mask.type.width > 32:
+            mask = builder.trunc(mask, i32)
+        if index.type.width > 32:
+            index = builder.trunc(index, i32)
+
+        func = cgutils.get_or_insert_function(lmod, fnty, fname)
+        if value_type.bitwidth == 32:
+            ret = builder.call(func, (mask, mode, value, index, clamp))
+            if value_type == types.float32:
+                rv = builder.extract_value(ret, 0)
+                pred = builder.extract_value(ret, 1)
+                fv = builder.bitcast(rv, ir.FloatType())
+                ret = cgutils.make_anonymous_struct(builder, (fv, pred))
+        else:
+            value1 = builder.trunc(value, ir.IntType(32))
+            value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
+            value2 = builder.trunc(value_lshr, ir.IntType(32))
+            ret1 = builder.call(func, (mask, mode, value1, index, clamp))
+            ret2 = builder.call(func, (mask, mode, value2, index, clamp))
+            rv1 = builder.extract_value(ret1, 0)
+            rv2 = builder.extract_value(ret2, 0)
+            pred = builder.extract_value(ret1, 1)
+            rv1_64 = builder.zext(rv1, ir.IntType(64))
+            rv2_64 = builder.zext(rv2, ir.IntType(64))
+            rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
+            rv = builder.or_(rv_shl, rv1_64)
+            if value_type == types.float64:
+                rv = builder.bitcast(rv, ir.DoubleType())
+            ret = cgutils.make_anonymous_struct(builder, (rv, pred))
+        return builder.extract_value(ret, 0)
+
+    return sig, codegen
diff --git a/numba_cuda/numba/cuda/stubs.py b/numba_cuda/numba/cuda/stubs.py
index a16607699..ccd7ca7c2 100644
--- a/numba_cuda/numba/cuda/stubs.py
+++ b/numba_cuda/numba/cuda/stubs.py
@@ -185,15 +185,16 @@ class syncwarp(Stub):
     _description_ = ""
 
 
-class shfl_sync_intrinsic(Stub):
-    """
-    shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
-
-    Nvvm intrinsic for shuffling data across a warp
-    docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
-    """
-
-    _description_ = ""
+# class shfl_sync_intrinsic(Stub):
+#     """
+#     shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
+#
+#     Nvvm intrinsic for shuffling data across a warp
+#     docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
+#     """
+#
+#     _description_ = ""
+#
 
 
 class vote_sync_intrinsic(Stub):
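
Reviewer note, not part of the patch: a minimal smoke-test sketch for the reworked wrappers, assuming the new intrinsics are re-exported as numba.cuda.shfl_xor_sync (per the device_init.py hunk above) and that a CUDA device is available. The kernel name warp_sum and the test values are placeholders of mine; cuda.grid and cuda.laneid are existing Numba CUDA APIs. This exercises the plain 32-bit integer path with a warp-wide butterfly reduction.

import numpy as np
from numba import cuda

@cuda.jit
def warp_sum(arr, out):
    i = cuda.grid(1)
    val = arr[i]
    # Butterfly reduction over a full warp: after the five xor rounds,
    # every lane holds the sum of all 32 lane values.
    offset = 16
    while offset > 0:
        val += cuda.shfl_xor_sync(0xFFFFFFFF, val, offset)
        offset //= 2
    if cuda.laneid == 0:
        out[0] = val

arr = np.arange(32, dtype=np.int32)
out = np.zeros(1, dtype=np.int32)
warp_sum[1, 32](arr, out)
assert out[0] == arr.sum()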
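A second sketch, under the same assumptions and also untested, targeting the 64-bit path the codegen docstring describes (the value is split into two i32 halves, each half is shuffled, and the result is repacked): broadcasting lane 0's float64 across the warp with shfl_sync. The result should be bit-identical to lane 0's value.

import numpy as np
from numba import cuda

@cuda.jit
def broadcast_lane0(arr, out):
    i = cuda.grid(1)
    # float64 values take the split/shuffle/repack path in the codegen.
    out[i] = cuda.shfl_sync(0xFFFFFFFF, arr[i], 0)

arr = np.random.rand(32)   # float64 by default
out = np.zeros_like(arr)
broadcast_lane0[1, 32](arr, out)
assert np.all(out == arr[0])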