diff --git a/csrc/libtorch_stable/activation_kernels.cu b/csrc/libtorch_stable/activation_kernels.cu index cdab456348e2..1e23c70451c7 100644 --- a/csrc/libtorch_stable/activation_kernels.cu +++ b/csrc/libtorch_stable/activation_kernels.cu @@ -79,9 +79,11 @@ __global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] const int d, const float limit) { - const scalar_t* x_ptr = input + blockIdx.x * 2 * d; + // Promote blockIdx.x to int64_t before multiplying by 2 * d (#42860). + const int64_t token_idx = blockIdx.x; + const scalar_t* x_ptr = input + token_idx * 2 * d; const scalar_t* y_ptr = x_ptr + d; - scalar_t* out_ptr = out + blockIdx.x * d; + scalar_t* out_ptr = out + token_idx * d; if constexpr (use_vec) { using cuda_t = typename CUDATypeConverter::Type; @@ -317,9 +319,10 @@ template ::Type; @@ -526,8 +529,9 @@ __global__ void activation_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., d] const int d) { - const scalar_t* in_ptr = input + blockIdx.x * d; - scalar_t* out_ptr = out + blockIdx.x * d; + const int64_t token_idx = blockIdx.x; + const scalar_t* in_ptr = input + token_idx * d; + scalar_t* out_ptr = out + token_idx * d; if constexpr (use_vec) { // Fast path: 128-bit/256-bit vectorized loop