diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 1a6b0f7dda..d7064b4d3d 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -287,6 +287,10 @@ __device__ float __shfl(float input, int lane, int width = warpSize); __device__ float __shfl_up(float input, unsigned int lane_delta, int width = warpSize); __device__ float __shfl_down(float input, unsigned int lane_delta, int width = warpSize); __device__ float __shfl_xor(float input, int lane_mask, int width = warpSize); +__device__ double __shfl(double input, int lane, int width = warpSize); +__device__ double __shfl_up(double input, unsigned int lane_delta, int width = warpSize); +__device__ double __shfl_down(double input, unsigned int lane_delta, int width = warpSize); +__device__ double __shfl_xor(double input, int lane_mask, int width = warpSize); #else __device__ int __shfl(int input, int lane, int width); __device__ int __shfl_up(int input, unsigned int lane_delta, int width); @@ -296,6 +300,10 @@ __device__ float __shfl(float input, int lane, int width); __device__ float __shfl_up(float input, unsigned int lane_delta, int width); __device__ float __shfl_down(float input, unsigned int lane_delta, int width); __device__ float __shfl_xor(float input, int lane_mask, int width); +__device__ double __shfl(double input, int lane, int width); +__device__ double __shfl_up(double input, unsigned int lane_delta, int width); +__device__ double __shfl_down(double input, unsigned int lane_delta, int width); +__device__ double __shfl_xor(double input, int lane_mask, int width); #endif //__cplusplus __device__ unsigned __hip_ds_bpermute(int index, unsigned src); diff --git a/src/device_util.cpp b/src/device_util.cpp index 830d70e741..8588c7331f 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -319,6 +319,20 @@ __device__ float __shfl_xor(float input, int lane_mask, int width) { return hc::__shfl_xor(input, lane_mask, width); } +__device__ double __shfl(double input, int lane, int width) { return hc::__shfl(input, lane, width); } + +__device__ double __shfl_up(double input, unsigned int lane_delta, int width) { + return hc::__shfl_up(input, lane_delta, width); +} + +__device__ double __shfl_down(double input, unsigned int lane_delta, int width) { + return hc::__shfl_down(input, lane_delta, width); +} + +__device__ double __shfl_xor(double input, int lane_mask, int width) { + return hc::__shfl_xor(input, lane_mask, width); +} + __host__ __device__ int min(int arg1, int arg2) { return (int)(hc::precise_math::fmin((float)arg1, (float)arg2)); } @@ -333,4 +347,4 @@ __device__ void* __get_dynamicgroupbaseptr() { __host__ void* __get_dynamicgroupbaseptr() { return nullptr; } -__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); } \ No newline at end of file +__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }