We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 5ec8b20, commit 37b55f6. Copy full SHA for 37b55f6
src/tl_templates/cuda/reduce.h
@@ -73,7 +73,7 @@ struct SharedReduceWarp {
73
74
unsigned mask = __activemask();
75
for (int offset = kWarpSize / 2; offset > 0; offset >>= 1) {
76
- T other = __shfl_down_sync(mask, partial, offset);
+ T other = tl::shfl_down_sync(mask, partial, offset);
77
partial = Reducer()(partial, other);
78
}
79
@@ -159,7 +159,7 @@ template <int threads, bool reverse = false> struct CumSum1D {
159
160
#pragma unroll
161
for (int off = 1; off < SEG; off <<= 1) {
162
- T n = (T)__shfl_down_sync(MASK, val, off);
+ T n = (T)tl::shfl_down_sync(MASK, val, off);
163
if (lane < SEG - off)
164
val += n;
165
0 commit comments