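CUDA: fix bug in topk-moe softmax (#16711)

When scaling `output_weights` by `inv_sum`, iterate `i` from 0 to `experts_per_thread` instead of striding from `threadIdx.x` by `WARP_SIZE` up to `n_expert_used`.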

am17an · web-flow · commit 9285325ce017 · 2025-10-22T12:33:08.000+08:00
diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
@@ -141,7 +141,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         wt_sum              = warp_reduce_sum(wt_sum);
         const float inv_sum = 1.0f / wt_sum;
 
-        for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
+        for (int i = 0; i < experts_per_thread; i++) {
             output_weights[i] *= inv_sum;
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -141,7 +141,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *`
`141`	`141`	`wt_sum = warp_reduce_sum(wt_sum);`
`142`	`142`	`const float inv_sum = 1.0f / wt_sum;`
`143`	`143`
`144`		`- for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {`
	`144`	`+ for (int i = 0; i < experts_per_thread; i++) {`
`145`	`145`	`output_weights[i] *= inv_sum;`
`146`	`146`	`}`
`147`	`147`	`}`