@@ -299,13 +299,15 @@ void invokeSiluAndMulNVFP4Quantization(void* output, void* output_scale, void* i
     void* input_global_scale, void* mask, bool use_silu_and_mul,
     int m_topk, int k, int n_experts, cudaStream_t stream) {
   int device;
-  cudaGetDevice(&device);
+  TLLM_CUDA_CHECK(cudaGetDevice(&device));
   int multiProcessorCount;
-  cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
+  TLLM_CUDA_CHECK(
+      cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device));

   // Grid, Block size.
   // Each thread converts 8 values.
-  int const workSizePerRow = k / CVT_ELTS_PER_THREAD;
+  TLLM_CHECK_WITH_INFO(k > 0, "k must be > 0");
+  int const workSizePerRow = max(1, k / CVT_ELTS_PER_THREAD);
   int const totalWorkSize = m_topk * workSizePerRow;
   dim3 block(std::min(workSizePerRow, 512));
   // Get number of blocks per SM (assume we can fully utilize the SM).
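For context, TLLM_CUDA_CHECK wraps a CUDA runtime call and raises an error when the call does not return cudaSuccess, instead of dropping the status as the old code did. The exact macro is defined in the project's own headers; the snippet below is only a minimal sketch of that pattern, with CHECK_CUDA_SKETCH as a hypothetical stand-in name.

```cpp
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Hypothetical stand-in for a TLLM_CUDA_CHECK-style macro: run a CUDA runtime
// call and abort with a readable message if it reports an error.
#define CHECK_CUDA_SKETCH(call)                                    \
  do {                                                             \
    cudaError_t err_ = (call);                                     \
    if (err_ != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",           \
                   cudaGetErrorString(err_), __FILE__, __LINE__);  \
      std::abort();                                                \
    }                                                              \
  } while (0)

int main() {
  int device = 0;
  CHECK_CUDA_SKETCH(cudaGetDevice(&device));

  int multiProcessorCount = 0;
  CHECK_CUDA_SKETCH(cudaDeviceGetAttribute(
      &multiProcessorCount, cudaDevAttrMultiProcessorCount, device));

  std::printf("device %d has %d SMs\n", device, multiProcessorCount);
  return 0;
}
```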
@@ -320,6 +322,7 @@ void invokeSiluAndMulNVFP4Quantization(void* output, void* output_scale, void* i
   // TODO(kaixih@nvidia): Should relax this to allow any grid size.
   // [email protected]: only deal with mask case
   TLLM_CHECK_WITH_INFO(mask != nullptr, "mask must be non-null for expert NVFP4 path");
+  TLLM_CHECK_WITH_INFO(n_experts > 0, "n_experts must be > 0");
   grid.x = (grid.x + n_experts - 1) / n_experts * n_experts;
   cvt_fp16_to_fp4_expert<T, false><<<grid, block, 0, stream>>>(
       m_topk, k, reinterpret_cast<T*>(input), reinterpret_cast<float*>(input_global_scale),
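The arithmetic on the grid.x line pads the grid up to the next multiple of n_experts so the blocks divide evenly across experts, and the new n_experts > 0 check guards the division in that expression. A small self-contained example of the round-up idiom (the concrete numbers are illustrative only):

```cpp
#include <cassert>

int main() {
  // Round the block count up to the next multiple of n_experts so every
  // expert is assigned the same number of blocks.
  int gridX = 130;
  int const n_experts = 8;

  // (130 + 7) / 8 = 17 groups of blocks, so gridX becomes 17 * 8 = 136.
  gridX = (gridX + n_experts - 1) / n_experts * n_experts;
  assert(gridX == 136);

  // With n_experts == 0 this expression would divide by zero, which is why
  // the diff checks n_experts > 0 before it.
  return 0;
}
```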