Skip to content

Commit

Permalink
minor fixes in cuda device lib
Browse files Browse the repository at this point in the history
  • Loading branch information
Mike-Leo-Smith committed Dec 29, 2024
1 parent a7fcaaa commit fce6d8f
Show file tree
Hide file tree
Showing 4 changed files with 44,062 additions and 44,359 deletions.
80 changes: 0 additions & 80 deletions src/backends/cuda/cuda_builtin/cuda_device_half.h
Original file line number Diff line number Diff line change
Expand Up @@ -1237,66 +1237,6 @@ __device__ __half2 __hmin2(const __half2 a, const __half2 b) {
return val;
}
}
__device__ __half2 __shfl(const __half2 var, const int delta, const int width) {
unsigned int warp_size;
asm("{mov.u32 %0, WARP_SZ;\n}"
: "=r"(warp_size));
const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
{
__half2 r;
asm volatile("{"
"shfl.idx.b32"
" %0,%1,%2,%3;\n}"
: "=r"(*(reinterpret_cast<unsigned int *>(&(r))))
: "r"(*(reinterpret_cast<const unsigned int *>(&(var)))), "r"(delta), "r"(c));
return r;
}
}
__device__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width) {
unsigned int warp_size;
asm("{mov.u32 %0, WARP_SZ;\n}"
: "=r"(warp_size));
const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
{
__half2 r;
asm volatile("{"
"shfl.up.b32"
" %0,%1,%2,%3;\n}"
: "=r"(*(reinterpret_cast<unsigned int *>(&(r))))
: "r"(*(reinterpret_cast<const unsigned int *>(&(var)))), "r"(delta), "r"(c));
return r;
}
}
__device__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width) {
unsigned int warp_size;
asm("{mov.u32 %0, WARP_SZ;\n}"
: "=r"(warp_size));
const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
{
__half2 r;
asm volatile("{"
"shfl.down.b32"
" %0,%1,%2,%3;\n}"
: "=r"(*(reinterpret_cast<unsigned int *>(&(r))))
: "r"(*(reinterpret_cast<const unsigned int *>(&(var)))), "r"(delta), "r"(c));
return r;
}
}
__device__ __half2 __shfl_xor(const __half2 var, const int delta, const int width) {
unsigned int warp_size;
asm("{mov.u32 %0, WARP_SZ;\n}"
: "=r"(warp_size));
const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
{
__half2 r;
asm volatile("{"
"shfl.bfly.b32"
" %0,%1,%2,%3;\n}"
: "=r"(*(reinterpret_cast<unsigned int *>(&(r))))
: "r"(*(reinterpret_cast<const unsigned int *>(&(var)))), "r"(delta), "r"(c));
return r;
}
}
__device__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width) {
unsigned int warp_size;
asm("{mov.u32 %0, WARP_SZ;\n}"
Expand Down Expand Up @@ -1357,26 +1297,6 @@ __device__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const
return r;
}
}
__device__ __half __shfl(const __half var, const int delta, const int width) {
const __half2 temp1 = __halves2half2(var, var);
const __half2 temp2 = __shfl(temp1, delta, width);
return __low2half(temp2);
}
__device__ __half __shfl_up(const __half var, const unsigned int delta, const int width) {
const __half2 temp1 = __halves2half2(var, var);
const __half2 temp2 = __shfl_up(temp1, delta, width);
return __low2half(temp2);
}
__device__ __half __shfl_down(const __half var, const unsigned int delta, const int width) {
const __half2 temp1 = __halves2half2(var, var);
const __half2 temp2 = __shfl_down(temp1, delta, width);
return __low2half(temp2);
}
__device__ __half __shfl_xor(const __half var, const int delta, const int width) {
const __half2 temp1 = __halves2half2(var, var);
const __half2 temp2 = __shfl_xor(temp1, delta, width);
return __low2half(temp2);
}
__device__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width) {
const __half2 temp1 = __halves2half2(var, var);
const __half2 temp2 = __shfl_sync(mask, temp1, delta, width);
Expand Down
7 changes: 1 addition & 6 deletions src/backends/cuda/cuda_builtin/cuda_device_resource.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#pragma once

[[nodiscard]] __device__ constexpr auto lc_infinity_half() noexcept { return __ushort_as_half(static_cast<unsigned short>(0x7c00u)); }
[[nodiscard]] __device__ constexpr auto lc_infinity_float() noexcept { return __int_as_float(0x7f800000u); }
[[nodiscard]] __device__ constexpr auto lc_infinity_double() noexcept { return __longlong_as_double(0x7ff0000000000000ull); }
Expand Down Expand Up @@ -1072,9 +1070,6 @@ template<typename T>
lc_assume(__isGlobal(array.slots));
auto buffer = static_cast<const T *>(array.slots[index].buffer);
lc_assume(__isGlobal(buffer));
#ifdef LUISA_DEBUG
lc_check_in_bounds(i, lc_bindless_buffer_size<T>(array, index));
#endif
return buffer;
}

Expand All @@ -1090,7 +1085,7 @@ template<typename T>
}

template<typename T>
[[nodiscard]] __device__ void lc_bindless_buffer_write(LCBindlessArray array, lc_uint index, lc_ulong i, T value) noexcept {
__device__ void lc_bindless_buffer_write(LCBindlessArray array, lc_uint index, lc_ulong i, T value) noexcept {
lc_assume(__isGlobal(array.slots));
auto buffer = static_cast<T *>(array.slots[index].buffer);
lc_assume(__isGlobal(buffer));
Expand Down
Loading

0 comments on commit fce6d8f

Please sign in to comment.