@@ -299,13 +299,15 @@ void invokeSiluAndMulNVFP4Quantization(void* output, void* output_scale, void* i
     void* input_global_scale, void* mask, bool use_silu_and_mul,
     int m_topk, int k, int n_experts, cudaStream_t stream) {
   int device;
-  cudaGetDevice(&device);
+  TLLM_CUDA_CHECK(cudaGetDevice(&device));
   int multiProcessorCount;
-  cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
+  TLLM_CUDA_CHECK(
+      cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device));

   // Grid, Block size.
   // Each thread converts 8 values.
-  int const workSizePerRow = k / CVT_ELTS_PER_THREAD;
+  TLLM_CHECK_WITH_INFO(k > 0, "k must be > 0");
+  int const workSizePerRow = max(1, k / CVT_ELTS_PER_THREAD);
   int const totalWorkSize = m_topk * workSizePerRow;
   dim3 block(std::min(workSizePerRow, 512));
   // Get number of blocks per SM (assume we can fully utilize the SM).
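For context, TLLM_CUDA_CHECK wraps a CUDA runtime call and raises an error when the call does not return cudaSuccess, instead of dropping the status as the old code did. The exact macro is defined in the project's own headers; the snippet below is only a minimal sketch of that pattern, with CHECK_CUDA_SKETCH as a hypothetical stand-in name.

```cpp
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Hypothetical stand-in for a TLLM_CUDA_CHECK-style macro: run a CUDA runtime
// call and abort with a readable message if it reports an error.
#define CHECK_CUDA_SKETCH(call)                                    \
  do {                                                             \
    cudaError_t err_ = (call);                                     \
    if (err_ != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",           \
                   cudaGetErrorString(err_), __FILE__, __LINE__);  \
      std::abort();                                                \
    }                                                              \
  } while (0)

int main() {
  int device = 0;
  CHECK_CUDA_SKETCH(cudaGetDevice(&device));

  int multiProcessorCount = 0;
  CHECK_CUDA_SKETCH(cudaDeviceGetAttribute(
      &multiProcessorCount, cudaDevAttrMultiProcessorCount, device));

  std::printf("device %d has %d SMs\n", device, multiProcessorCount);
  return 0;
}
```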
@@ -320,6 +322,7 @@ void invokeSiluAndMulNVFP4Quantization(void* output, void* output_scale, void* i
   // TODO(kaixih@nvidia): Should relax this to allow any grid size.
   // [email protected]: only deal with mask case
   TLLM_CHECK_WITH_INFO(mask != nullptr, "mask must be non-null for expert NVFP4 path");
+  TLLM_CHECK_WITH_INFO(n_experts > 0, "n_experts must be > 0");
   grid.x = (grid.x + n_experts - 1) / n_experts * n_experts;
   cvt_fp16_to_fp4_expert<T, false><<<grid, block, 0, stream>>>(
       m_topk, k, reinterpret_cast<T*>(input), reinterpret_cast<float*>(input_global_scale),
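The arithmetic on the grid.x line pads the grid up to the next multiple of n_experts so the blocks divide evenly across experts, and the new n_experts > 0 check guards the division in that expression. A small self-contained example of the round-up idiom (the concrete numbers are illustrative only):

```cpp
#include <cassert>

int main() {
  // Round the block count up to the next multiple of n_experts so every
  // expert is assigned the same number of blocks.
  int gridX = 130;
  int const n_experts = 8;

  // (130 + 7) / 8 = 17 groups of blocks, so gridX becomes 17 * 8 = 136.
  gridX = (gridX + n_experts - 1) / n_experts * n_experts;
  assert(gridX == 136);

  // With n_experts == 0 this expression would divide by zero, which is why
  // the diff checks n_experts > 0 before it.
  return 0;
}
```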