Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 39 additions & 39 deletions numba_cuda/numba/cuda/cudadecl.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,43 +100,43 @@ class Cuda_syncwarp(ConcreteTemplate):
cases = [signature(types.none), signature(types.none, types.i4)]


@register
class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
    """Typing template for ``cuda.shfl_sync_intrinsic``.

    One signature is registered per supported value type (i4, i8, f4, f8).
    Each takes five arguments, all ``i4`` except the third, which carries
    the value type, and returns a ``(value, b1)`` tuple of that same type.
    """

    key = cuda.shfl_sync_intrinsic
    # The four cases differ only in the value type, so build them in the
    # same order (i4, i8, f4, f8) from a single pattern.
    cases = [
        signature(
            types.Tuple((value_ty, types.b1)),
            types.i4,
            types.i4,
            value_ty,
            types.i4,
            types.i4,
        )
        for value_ty in (types.i4, types.i8, types.f4, types.f8)
    ]
# @register
# class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
# key = cuda.shfl_sync_intrinsic
# cases = [
# signature(
# types.Tuple((types.i4, types.b1)),
# types.i4,
# types.i4,
# types.i4,
# types.i4,
# types.i4,
# ),
# signature(
# types.Tuple((types.i8, types.b1)),
# types.i4,
# types.i4,
# types.i8,
# types.i4,
# types.i4,
# ),
# signature(
# types.Tuple((types.f4, types.b1)),
# types.i4,
# types.i4,
# types.f4,
# types.i4,
# types.i4,
# ),
# signature(
# types.Tuple((types.f8, types.b1)),
# types.i4,
# types.i4,
# types.f8,
# types.i4,
# types.i4,
# ),
# ]


@register
Expand Down Expand Up @@ -815,8 +815,8 @@ def resolve_threadfence_system(self, mod):
def resolve_syncwarp(self, mod):
    """Return the function type built from the ``Cuda_syncwarp`` template."""
    template = Cuda_syncwarp
    return types.Function(template)

def resolve_shfl_sync_intrinsic(self, mod):
    """Return the function type built from ``Cuda_shfl_sync_intrinsic``."""
    template = Cuda_shfl_sync_intrinsic
    return types.Function(template)
# def resolve_shfl_sync_intrinsic(self, mod):
# return types.Function(Cuda_shfl_sync_intrinsic)

def resolve_vote_sync_intrinsic(self, mod):
    """Return the function type built from ``Cuda_vote_sync_intrinsic``."""
    template = Cuda_vote_sync_intrinsic
    return types.Function(template)
Expand Down
123 changes: 62 additions & 61 deletions numba_cuda/numba/cuda/cudaimpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,67 +204,68 @@ def ptx_syncwarp_mask(context, builder, sig, args):
return context.get_dummy_value()


@lower(
    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
)
@lower(
    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
)
@lower(
    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
)
@lower(
    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
)
def ptx_shfl_sync_i32(context, builder, sig, args):
    """
    Lower ``shfl_sync_intrinsic`` onto the ``llvm.nvvm.shfl.sync.i32``
    intrinsic for i32, i64, f32 and f64 values.

    The NVVM intrinsic only shuffles i32, so the other types are adapted:

    * floats are bitcast to a same-width integer before the shuffle and
      bitcast back afterwards;
    * 64-bit values are split into low and high 32-bit halves, each half
      is shuffled separately, and the halves are recombined.

    ``args`` is ``(mask, mode, value, index, clamp)`` and ``sig.args[2]``
    is the type of ``value``. The result is a two-element struct holding
    the shuffled value and the i1 flag produced by the intrinsic (for
    64-bit values, the flag of the low-half shuffle).
    """
    mask, mode, value, index, clamp = args
    value_type = sig.args[2]
    i32 = ir.IntType(32)

    # Floats travel through the shuffle as raw integer bits.
    if value_type in types.real_domain:
        value = builder.bitcast(value, ir.IntType(value_type.bitwidth))

    shfl_fnty = ir.FunctionType(
        ir.LiteralStructType((i32, ir.IntType(1))),
        (i32,) * 5,
    )
    shfl = cgutils.get_or_insert_function(
        builder.module, shfl_fnty, "llvm.nvvm.shfl.sync.i32"
    )

    if value_type.bitwidth == 32:
        shuffled = builder.call(shfl, (mask, mode, value, index, clamp))
        if value_type != types.float32:
            # i32: the intrinsic's own result struct is already correct.
            return shuffled
        bits = builder.extract_value(shuffled, 0)
        flag = builder.extract_value(shuffled, 1)
        as_float = builder.bitcast(bits, ir.FloatType())
        return cgutils.make_anonymous_struct(builder, (as_float, flag))

    # 64-bit path: shuffle the low and high halves independently.
    i64 = ir.IntType(64)
    shift_32 = context.get_constant(types.i8, 32)
    low_in = builder.trunc(value, i32)
    high_in = builder.trunc(builder.lshr(value, shift_32), i32)
    low_ret = builder.call(shfl, (mask, mode, low_in, index, clamp))
    high_ret = builder.call(shfl, (mask, mode, high_in, index, clamp))
    low_out = builder.extract_value(low_ret, 0)
    high_out = builder.extract_value(high_ret, 0)
    flag = builder.extract_value(low_ret, 1)
    low_64 = builder.zext(low_out, i64)
    high_64 = builder.zext(high_out, i64)
    combined = builder.or_(builder.shl(high_64, shift_32), low_64)
    if value_type == types.float64:
        combined = builder.bitcast(combined, ir.DoubleType())
    return cgutils.make_anonymous_struct(builder, (combined, flag))
# @lower(
# stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
# )
# @lower(
# stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
# )
# @lower(
# stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
# )
# @lower(
# stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
# )
# def ptx_shfl_sync_i32(context, builder, sig, args):
# """
# The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
# function supports both 32 and 64 bit ints and floats, so for feature parity,
# i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
# an int, then shuffling, then bitcasting back. And 64-bit values by packing
# them into 2 32bit values, shuffling those, and then packing back together.
# """
# mask, mode, value, index, clamp = args
# value_type = sig.args[2]
# if value_type in types.real_domain:
# value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
# fname = "llvm.nvvm.shfl.sync.i32"
# lmod = builder.module
# fnty = ir.FunctionType(
# ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
# (
# ir.IntType(32),
# ir.IntType(32),
# ir.IntType(32),
# ir.IntType(32),
# ir.IntType(32),
# ),
# )
# func = cgutils.get_or_insert_function(lmod, fnty, fname)
# if value_type.bitwidth == 32:
# ret = builder.call(func, (mask, mode, value, index, clamp))
# if value_type == types.float32:
# rv = builder.extract_value(ret, 0)
# pred = builder.extract_value(ret, 1)
# fv = builder.bitcast(rv, ir.FloatType())
# ret = cgutils.make_anonymous_struct(builder, (fv, pred))
# else:
# value1 = builder.trunc(value, ir.IntType(32))
# value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
# value2 = builder.trunc(value_lshr, ir.IntType(32))
# ret1 = builder.call(func, (mask, mode, value1, index, clamp))
# ret2 = builder.call(func, (mask, mode, value2, index, clamp))
# rv1 = builder.extract_value(ret1, 0)
# rv2 = builder.extract_value(ret2, 0)
# pred = builder.extract_value(ret1, 1)
# rv1_64 = builder.zext(rv1, ir.IntType(64))
# rv2_64 = builder.zext(rv2, ir.IntType(64))
# rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
# rv = builder.or_(rv_shl, rv1_64)
# if value_type == types.float64:
# rv = builder.bitcast(rv, ir.DoubleType())
# ret = cgutils.make_anonymous_struct(builder, (rv, pred))
# return ret
#


@lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
Expand Down
10 changes: 5 additions & 5 deletions numba_cuda/numba/cuda/device_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
local,
const,
atomic,
shfl_sync_intrinsic,
#shfl_sync_intrinsic,
vote_sync_intrinsic,
match_any_sync,
match_all_sync,
Expand All @@ -40,6 +40,10 @@
syncthreads_and,
syncthreads_count,
syncthreads_or,
shfl_sync,
shfl_up_sync,
shfl_down_sync,
shfl_xor_sync,
)
from .cudadrv.error import CudaSupportError
from numba.cuda.cudadrv.driver import (
Expand Down Expand Up @@ -68,10 +72,6 @@
any_sync,
eq_sync,
ballot_sync,
shfl_sync,
shfl_up_sync,
shfl_down_sync,
shfl_xor_sync,
)

from .kernels import reduction
Expand Down
74 changes: 37 additions & 37 deletions numba_cuda/numba/cuda/intrinsic_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,40 +38,40 @@ def ballot_sync(mask, predicate):
return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]


@jit(device=True)
def shfl_sync(mask, value, src_lane):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    ``src_lane``. If that lane is outside the warp, the given value is
    returned instead.
    """
    # Mode 0 with clamp 0x1F; element 0 of the result is the shuffled value.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)
    return shuffled[0]


@jit(device=True)
def shfl_up_sync(mask, value, delta):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``laneid - delta``. If that lane is outside the warp, the given
    value is returned instead.
    """
    # Mode 1 with clamp 0; element 0 of the result is the shuffled value.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)
    return shuffled[0]


@jit(device=True)
def shfl_down_sync(mask, value, delta):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``laneid + delta``. If that lane is outside the warp, the given
    value is returned instead.
    """
    # Mode 2 with clamp 0x1F; element 0 of the result is the shuffled value.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)
    return shuffled[0]


@jit(device=True)
def shfl_xor_sync(mask, value, lane_mask):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``laneid ^ lane_mask``.
    """
    # Mode 3 with clamp 0x1F; element 0 of the result is the shuffled value.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)
    return shuffled[0]
# @jit(device=True)
# def shfl_sync(mask, value, src_lane):
# """
# Shuffles value across the masked warp and returns the value
# from src_lane. If this is outside the warp, then the
# given value is returned.
# """
# return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]


# @jit(device=True)
# def shfl_up_sync(mask, value, delta):
# """
# Shuffles value across the masked warp and returns the value
# from (laneid - delta). If this is outside the warp, then the
# given value is returned.
# """
# return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
#
#
# @jit(device=True)
# def shfl_down_sync(mask, value, delta):
# """
# Shuffles value across the masked warp and returns the value
# from (laneid + delta). If this is outside the warp, then the
# given value is returned.
# """
# return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
#
#
# @jit(device=True)
# def shfl_xor_sync(mask, value, lane_mask):
# """
# Shuffles value across the masked warp and returns the value
# from (laneid ^ lane_mask).
# """
# return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
Loading
Loading