diff --git a/numba_cuda/numba/cuda/cudadecl.py b/numba_cuda/numba/cuda/cudadecl.py
index 547272601..e348fd5f9 100644
--- a/numba_cuda/numba/cuda/cudadecl.py
+++ b/numba_cuda/numba/cuda/cudadecl.py
@@ -100,43 +100,43 @@ class Cuda_syncwarp(ConcreteTemplate):
     cases = [signature(types.none), signature(types.none, types.i4)]
 
 
-@register
-class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
-    key = cuda.shfl_sync_intrinsic
-    cases = [
-        signature(
-            types.Tuple((types.i4, types.b1)),
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.i8, types.b1)),
-            types.i4,
-            types.i4,
-            types.i8,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f4, types.b1)),
-            types.i4,
-            types.i4,
-            types.f4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f8, types.b1)),
-            types.i4,
-            types.i4,
-            types.f8,
-            types.i4,
-            types.i4,
-        ),
-    ]
+# @register
+# class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
+#     key = cuda.shfl_sync_intrinsic
+#     cases = [
+#         signature(
+#             types.Tuple((types.i4, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.i4,
+#             types.i4,
+#             types.i4,
+#         ),
+#         signature(
+#             types.Tuple((types.i8, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.i8,
+#             types.i4,
+#             types.i4,
+#         ),
+#         signature(
+#             types.Tuple((types.f4, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.f4,
+#             types.i4,
+#             types.i4,
+#         ),
+#         signature(
+#             types.Tuple((types.f8, types.b1)),
+#             types.i4,
+#             types.i4,
+#             types.f8,
+#             types.i4,
+#             types.i4,
+#         ),
+#     ]
 
 
 @register
@@ -815,8 +815,8 @@ def resolve_threadfence_system(self, mod):
     def resolve_syncwarp(self, mod):
         return types.Function(Cuda_syncwarp)
 
-    def resolve_shfl_sync_intrinsic(self, mod):
-        return types.Function(Cuda_shfl_sync_intrinsic)
+    # def resolve_shfl_sync_intrinsic(self, mod):
+    #     return types.Function(Cuda_shfl_sync_intrinsic)
 
     def resolve_vote_sync_intrinsic(self, mod):
         return types.Function(Cuda_vote_sync_intrinsic)
diff --git a/numba_cuda/numba/cuda/cudaimpl.py b/numba_cuda/numba/cuda/cudaimpl.py
index 931c43e31..9dc3b67bf 100644
--- a/numba_cuda/numba/cuda/cudaimpl.py
+++ b/numba_cuda/numba/cuda/cudaimpl.py
@@ -204,67 +204,68 @@ def ptx_syncwarp_mask(context, builder, sig, args):
     return context.get_dummy_value()
 
 
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
-)
-def ptx_shfl_sync_i32(context, builder, sig, args):
-    """
-    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
-    function supports both 32 and 64 bit ints and floats, so for feature parity,
-    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
-    an int, then shuffling, then bitcasting back. And 64-bit values by packing
-    them into 2 32bit values, shuffling thoose, and then packing back together.
-    """
-    mask, mode, value, index, clamp = args
-    value_type = sig.args[2]
-    if value_type in types.real_domain:
-        value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
-    fname = "llvm.nvvm.shfl.sync.i32"
-    lmod = builder.module
-    fnty = ir.FunctionType(
-        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
-        (
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-        ),
-    )
-    func = cgutils.get_or_insert_function(lmod, fnty, fname)
-    if value_type.bitwidth == 32:
-        ret = builder.call(func, (mask, mode, value, index, clamp))
-        if value_type == types.float32:
-            rv = builder.extract_value(ret, 0)
-            pred = builder.extract_value(ret, 1)
-            fv = builder.bitcast(rv, ir.FloatType())
-            ret = cgutils.make_anonymous_struct(builder, (fv, pred))
-    else:
-        value1 = builder.trunc(value, ir.IntType(32))
-        value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
-        value2 = builder.trunc(value_lshr, ir.IntType(32))
-        ret1 = builder.call(func, (mask, mode, value1, index, clamp))
-        ret2 = builder.call(func, (mask, mode, value2, index, clamp))
-        rv1 = builder.extract_value(ret1, 0)
-        rv2 = builder.extract_value(ret2, 0)
-        pred = builder.extract_value(ret1, 1)
-        rv1_64 = builder.zext(rv1, ir.IntType(64))
-        rv2_64 = builder.zext(rv2, ir.IntType(64))
-        rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
-        rv = builder.or_(rv_shl, rv1_64)
-        if value_type == types.float64:
-            rv = builder.bitcast(rv, ir.DoubleType())
-        ret = cgutils.make_anonymous_struct(builder, (rv, pred))
-    return ret
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
+# )
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
+# )
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
+# )
+# @lower(
+#     stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
+# )
+# def ptx_shfl_sync_i32(context, builder, sig, args):
+#     """
+#     The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
+#     function supports both 32 and 64 bit ints and floats, so for feature parity,
+#     i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
+#     an int, then shuffling, then bitcasting back. And 64-bit values by packing
+#     them into 2 32bit values, shuffling those, and then packing back together.
+#     """
+#     mask, mode, value, index, clamp = args
+#     value_type = sig.args[2]
+#     if value_type in types.real_domain:
+#         value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
+#     fname = "llvm.nvvm.shfl.sync.i32"
+#     lmod = builder.module
+#     fnty = ir.FunctionType(
+#         ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
+#         (
+#             ir.IntType(32),
+#             ir.IntType(32),
+#             ir.IntType(32),
+#             ir.IntType(32),
+#             ir.IntType(32),
+#         ),
+#     )
+#     func = cgutils.get_or_insert_function(lmod, fnty, fname)
+#     if value_type.bitwidth == 32:
+#         ret = builder.call(func, (mask, mode, value, index, clamp))
+#         if value_type == types.float32:
+#             rv = builder.extract_value(ret, 0)
+#             pred = builder.extract_value(ret, 1)
+#             fv = builder.bitcast(rv, ir.FloatType())
+#             ret = cgutils.make_anonymous_struct(builder, (fv, pred))
+#     else:
+#         value1 = builder.trunc(value, ir.IntType(32))
+#         value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
+#         value2 = builder.trunc(value_lshr, ir.IntType(32))
+#         ret1 = builder.call(func, (mask, mode, value1, index, clamp))
+#         ret2 = builder.call(func, (mask, mode, value2, index, clamp))
+#         rv1 = builder.extract_value(ret1, 0)
+#         rv2 = builder.extract_value(ret2, 0)
+#         pred = builder.extract_value(ret1, 1)
+#         rv1_64 = builder.zext(rv1, ir.IntType(64))
+#         rv2_64 = builder.zext(rv2, ir.IntType(64))
+#         rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
+#         rv = builder.or_(rv_shl, rv1_64)
+#         if value_type == types.float64:
+#             rv = builder.bitcast(rv, ir.DoubleType())
+#         ret = cgutils.make_anonymous_struct(builder, (rv, pred))
+#     return ret
+#
 
 
 @lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
diff --git a/numba_cuda/numba/cuda/device_init.py b/numba_cuda/numba/cuda/device_init.py
index da8074754..24b741299 100644
--- a/numba_cuda/numba/cuda/device_init.py
+++ b/numba_cuda/numba/cuda/device_init.py
@@ -13,7 +13,7 @@
     local,
     const,
     atomic,
-    shfl_sync_intrinsic,
+    # shfl_sync_intrinsic,
    vote_sync_intrinsic,
     match_any_sync,
     match_all_sync,
@@ -40,6 +40,10 @@
     syncthreads_and,
     syncthreads_count,
     syncthreads_or,
+    shfl_sync,
+    shfl_up_sync,
+    shfl_down_sync,
+    shfl_xor_sync,
 )
 from .cudadrv.error import CudaSupportError
 from numba.cuda.cudadrv.driver import (
@@ -68,10 +72,6 @@
     any_sync,
     eq_sync,
     ballot_sync,
-    shfl_sync,
-    shfl_up_sync,
-    shfl_down_sync,
-    shfl_xor_sync,
 )
 
 from .kernels import reduction
diff --git a/numba_cuda/numba/cuda/intrinsic_wrapper.py b/numba_cuda/numba/cuda/intrinsic_wrapper.py
index cfbdf06fe..9ff85c300 100644
--- a/numba_cuda/numba/cuda/intrinsic_wrapper.py
+++ b/numba_cuda/numba/cuda/intrinsic_wrapper.py
@@ -38,40 +38,40 @@ def ballot_sync(mask, predicate):
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
 
 
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-
-
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
+# @jit(device=True)
+# def shfl_sync(mask, value, src_lane):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from src_lane. If this is outside the warp, then the
+#     given value is returned.
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
+
+
+# @jit(device=True)
+# def shfl_up_sync(mask, value, delta):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from (laneid - delta). If this is outside the warp, then the
+#     given value is returned.
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
+#
+#
+# @jit(device=True)
+# def shfl_down_sync(mask, value, delta):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from (laneid + delta). If this is outside the warp, then the
+#     given value is returned.
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
+#
+#
+# @jit(device=True)
+# def shfl_xor_sync(mask, value, lane_mask):
+#     """
+#     Shuffles value across the masked warp and returns the value
+#     from (laneid ^ lane_mask).
+#     """
+#     return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
diff --git a/numba_cuda/numba/cuda/intrinsics.py b/numba_cuda/numba/cuda/intrinsics.py
index 2691ee8eb..505c2a504 100644
--- a/numba_cuda/numba/cuda/intrinsics.py
+++ b/numba_cuda/numba/cuda/intrinsics.py
@@ -205,3 +205,139 @@ def syncthreads_or(typingctx, predicate):
 @overload_method(types.Integer, "bit_count", target="cuda")
 def integer_bit_count(i):
     return lambda i: cuda.popc(i)
+
+
+# -----------------------------------------------------------------------------
+# shfl_sync
+
+
+@intrinsic
+def shfl_sync(typingctx, mask, value, src_lane):
+    """
+    Shuffles value across the masked warp and returns the value
+    from src_lane. If this is outside the warp, then the
+    given value is returned.
+    """
+    mode_value = 0
+    clamp_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, src_lane, clamp_value
+    )
+
+
+@intrinsic
+def shfl_up_sync(typingctx, mask, value, delta):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid - delta). If this is outside the warp, then the
+    given value is returned.
+    """
+    mode_value = 1
+    clamp_value = 0
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, delta, clamp_value
+    )
+
+
+@intrinsic
+def shfl_down_sync(typingctx, mask, value, delta):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid + delta). If this is outside the warp, then the
+    given value is returned.
+    """
+    mode_value = 2
+    clamp_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, delta, clamp_value
+    )
+
+
+@intrinsic
+def shfl_xor_sync(typingctx, mask, value, lane_mask):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid ^ lane_mask).
+    """
+    mode_value = 3
+    clamp_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, mask, mode_value, value, lane_mask, clamp_value
+    )
+
+
+def shfl_sync_intrinsic(
+    typingctx, mask, mode_value, value, src_lane, clamp_value
+):
+    if value not in (types.i4, types.i8, types.f4, types.f8):
+        # XXX: Support a more general set of types?
+        raise TypingError(
+            "shfl_sync only supports 32- and 64-bit ints and floats"
+        )
+
+    # The result type matches the shuffled value; the i1 predicate returned
+    # by the NVVM intrinsic is dropped. mask and src_lane are narrowed to
+    # i32 in the codegen below, since Python ints default to int64.
+    sig = signature(value, mask, value, src_lane)
+
+    def codegen(context, builder, sig, args):
+        """
+        The NVVM intrinsic for shfl only supports i32, but the CUDA
+        intrinsic function supports both 32- and 64-bit ints and floats,
+        so for feature parity, i64, f32, and f64 are also implemented.
+        Floats are bitcast to an int, shuffled, then bitcast back; 64-bit
+        values are split into two 32-bit halves, each half is shuffled,
+        and the result is packed back together.
+        """
+        mask, value, index = args
+        value_type = sig.args[1]
+        if value_type in types.real_domain:
+            value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
+        fname = "llvm.nvvm.shfl.sync.i32"
+        lmod = builder.module
+        fnty = ir.FunctionType(
+            ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
+            (
+                ir.IntType(32),
+                ir.IntType(32),
+                ir.IntType(32),
+                ir.IntType(32),
+                ir.IntType(32),
+            ),
+        )
+
+        i32 = ir.IntType(32)
+        mode = ir.Constant(i32, mode_value)
+        clamp = ir.Constant(i32, clamp_value)
+        if mask.type.width > 32:
+            mask = builder.trunc(mask, i32)
+        if index.type.width > 32:
+            index = builder.trunc(index, i32)
+
+        func = cgutils.get_or_insert_function(lmod, fnty, fname)
+        if value_type.bitwidth == 32:
+            ret = builder.call(func, (mask, mode, value, index, clamp))
+            if value_type == types.float32:
+                rv = builder.extract_value(ret, 0)
+                pred = builder.extract_value(ret, 1)
+                fv = builder.bitcast(rv, ir.FloatType())
+                ret = cgutils.make_anonymous_struct(builder, (fv, pred))
+        else:
+            value1 = builder.trunc(value, ir.IntType(32))
+            value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
+            value2 = builder.trunc(value_lshr, ir.IntType(32))
+            ret1 = builder.call(func, (mask, mode, value1, index, clamp))
+            ret2 = builder.call(func, (mask, mode, value2, index, clamp))
+            rv1 = builder.extract_value(ret1, 0)
+            rv2 = builder.extract_value(ret2, 0)
+            pred = builder.extract_value(ret1, 1)
+            rv1_64 = builder.zext(rv1, ir.IntType(64))
+            rv2_64 = builder.zext(rv2, ir.IntType(64))
+            rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
+            rv = builder.or_(rv_shl, rv1_64)
+            if value_type == types.float64:
+                rv = builder.bitcast(rv, ir.DoubleType())
+            ret = cgutils.make_anonymous_struct(builder, (rv, pred))
+        return builder.extract_value(ret, 0)
+
+    return sig, codegen
diff --git a/numba_cuda/numba/cuda/stubs.py b/numba_cuda/numba/cuda/stubs.py
index a16607699..ccd7ca7c2 100644
--- a/numba_cuda/numba/cuda/stubs.py
+++ b/numba_cuda/numba/cuda/stubs.py
@@ -185,15 +185,16 @@ class syncwarp(Stub):
     _description_ = ""
 
 
-class shfl_sync_intrinsic(Stub):
-    """
-    shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
-
-    Nvvm intrinsic for shuffling data across a warp
-    docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
-    """
-
-    _description_ = ""
+# class shfl_sync_intrinsic(Stub):
+#     """
+#     shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
+#
+#     Nvvm intrinsic for shuffling data across a warp
+#     docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
+#     """
+#
+#     _description_ = ""
+#
 
 
 class vote_sync_intrinsic(Stub):
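
Reviewer note, not part of the patch: a minimal smoke-test sketch for the reworked wrappers, assuming the new intrinsics are re-exported as numba.cuda.shfl_xor_sync (per the device_init.py hunk above) and that a CUDA device is available. The kernel name warp_sum and the test values are placeholders of mine; cuda.grid and cuda.laneid are existing Numba CUDA APIs. This exercises the plain 32-bit integer path with a warp-wide butterfly reduction.

import numpy as np
from numba import cuda

@cuda.jit
def warp_sum(arr, out):
    i = cuda.grid(1)
    val = arr[i]
    # Butterfly reduction over a full warp: after the five xor rounds,
    # every lane holds the sum of all 32 lane values.
    offset = 16
    while offset > 0:
        val += cuda.shfl_xor_sync(0xFFFFFFFF, val, offset)
        offset //= 2
    if cuda.laneid == 0:
        out[0] = val

arr = np.arange(32, dtype=np.int32)
out = np.zeros(1, dtype=np.int32)
warp_sum[1, 32](arr, out)
assert out[0] == arr.sum()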
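A second sketch, under the same assumptions and also untested, targeting the 64-bit path the codegen docstring describes (the value is split into two i32 halves, each half is shuffled, and the result is repacked): broadcasting lane 0's float64 across the warp with shfl_sync. The result should be bit-identical to lane 0's value.

import numpy as np
from numba import cuda

@cuda.jit
def broadcast_lane0(arr, out):
    i = cuda.grid(1)
    # float64 values take the split/shuffle/repack path in the codegen.
    out[i] = cuda.shfl_sync(0xFFFFFFFF, arr[i], 0)

arr = np.random.rand(32)   # float64 by default
out = np.zeros_like(arr)
broadcast_lane0[1, 32](arr, out)
assert np.all(out == arr[0])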