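Fix data type handling in warp shuffles: call tl::shfl_down_sync instead of the raw __shfl_down_sync intrinsic in SharedReduceWarp and CumSum1D (tile-ai#1204)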

LJC00118 · web-flow · commit 37b55f693c46 · 2025-11-07T12:33:55.000+08:00
diff --git a/src/tl_templates/cuda/reduce.h b/src/tl_templates/cuda/reduce.h
@@ -73,7 +73,7 @@ struct SharedReduceWarp {
 
       unsigned mask = __activemask();
       for (int offset = kWarpSize / 2; offset > 0; offset >>= 1) {
-        T other = __shfl_down_sync(mask, partial, offset);
+        T other = tl::shfl_down_sync(mask, partial, offset);
         partial = Reducer()(partial, other);
       }
 
@@ -159,7 +159,7 @@ template <int threads, bool reverse = false> struct CumSum1D {
 
 #pragma unroll
         for (int off = 1; off < SEG; off <<= 1) {
-          T n = (T)__shfl_down_sync(MASK, val, off);
+          T n = (T)tl::shfl_down_sync(MASK, val, off);
           if (lane < SEG - off)
             val += n;
         }

Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ struct SharedReduceWarp {`
`73`	`73`
`74`	`74`	`unsigned mask = __activemask();`
`75`	`75`	`for (int offset = kWarpSize / 2; offset > 0; offset >>= 1) {`
`76`		`- T other = __shfl_down_sync(mask, partial, offset);`
	`76`	`+ T other = tl::shfl_down_sync(mask, partial, offset);`
`77`	`77`	`partial = Reducer()(partial, other);`
`78`	`78`	`}`
`79`	`79`
`@@ -159,7 +159,7 @@ template <int threads, bool reverse = false> struct CumSum1D {`
`159`	`159`
`160`	`160`	`#pragma unroll`
`161`	`161`	`for (int off = 1; off < SEG; off <<= 1) {`
`162`		`- T n = (T)__shfl_down_sync(MASK, val, off);`
	`162`	`+ T n = (T)tl::shfl_down_sync(MASK, val, off);`
`163`	`163`	`if (lane < SEG - off)`
`164`	`164`	`val += n;`
`165`	`165`	`}`