Commit 6b55e4d

Address comments
Signed-off-by: Shu Wang. <[email protected]>
1 parent 856f918 commit 6b55e4d

3 files changed: +8 -5 lines changed

csrc/nv_internal/cpp/kernels/quantization.cu

Lines changed: 1 addition & 1 deletion
@@ -319,7 +319,7 @@ void invokeSiluAndMulNVFP4Quantization(void* output, void* output_scale, void* i
 
   // TODO(kaixih@nvidia): Should relax this to allow any grid size.
   // [email protected]: only deal with mask case
-  assert(mask != nullptr);
+  TLLM_CHECK_WITH_INFO(mask != nullptr, "mask must be non-null for expert NVFP4 path");
   grid.x = (grid.x + n_experts - 1) / n_experts * n_experts;
   cvt_fp16_to_fp4_expert<T, false><<<grid, block, 0, stream>>>(
       m_topk, k, reinterpret_cast<T*>(input), reinterpret_cast<float*>(input_global_scale),
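
Why the switch away from assert matters: assert() is compiled out in NDEBUG (release) builds, so the mask precondition would go unchecked exactly where the kernel runs in production, while a TLLM_CHECK_WITH_INFO-style check always executes and attaches a message. A rough sketch of the same hardening pattern in Python terms, since assert statements are likewise stripped under python -O (check_with_info and quantize_experts are illustrative names, not part of this commit):

def check_with_info(cond: bool, msg: str) -> None:
    # Unlike `assert`, which is removed when Python runs with -O, this check
    # always executes and raises with a human-readable message.
    if not cond:
        raise RuntimeError(msg)

def quantize_experts(inputs, mask):
    # Hypothetical caller mirroring the hardened precondition from the diff.
    check_with_info(mask is not None, "mask must be non-null for expert NVFP4 path")
    return inputs  # the kernel launch would go here

try:
    quantize_experts(inputs=[], mask=None)
except RuntimeError as err:
    print(err)  # mask must be non-null for expert NVFP4 path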

csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp

Lines changed: 1 addition & 1 deletion
@@ -159,7 +159,7 @@ void fp4_batched_quantize(Tensor self, Tensor globalScale, Tensor valueE2M1, Ten
   tensorrt_llm::kernels::invokeFP4Quantization<T, SF_VEC_SIZE>(                                    \
       b, m, k, reinterpret_cast<T*>(self->data), static_cast<float*>(globalScale->data),           \
       reinterpret_cast<int64_t*>(valueE2M1->data), reinterpret_cast<int32_t*>(scaleFP8SF->data),   \
-      sfUseUE8M0, layout, mMultiProcessorCount, get_stream(self->device));
+      sfUseUE8M0, layout, mMultiProcessorCount, /*enable_pdl=*/false, get_stream(self->device));
 
   if (self->dtype == dl_float16) {
     LAUNCH_FP4_QUANTIZE_KERNEL(half, 16)
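
Why the call site spells the new argument as /*enable_pdl=*/false: the launcher gained an extra boolean (presumably programmatic dependent launch) just before the stream argument, and passing it explicitly with an inline name keeps the literal readable and preserves the previous behavior (PDL off). A rough sketch of the analogous call-site discipline in Python, using a hypothetical stand-in rather than the real binding:

def invoke_fp4_quantization(b, m, k, *, sf_use_ue8m0=False, enable_pdl=False, stream=None):
    # Hypothetical stand-in: keyword-only booleans cannot be swapped positionally,
    # which is the readability problem the /*enable_pdl=*/false comment solves in C++.
    return (b, m, k, sf_use_ue8m0, enable_pdl, stream)

# The new flag defaults to off, so existing callers keep their behavior:
invoke_fp4_quantization(8, 100, 256, sf_use_ue8m0=False, enable_pdl=False)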

flashinfer/fp4_quantization.py

Lines changed: 6 additions & 3 deletions
@@ -362,10 +362,13 @@ def _fake_fp4_batched_quantize_sm100(
     sf_vec_size: int = 16,
     sf_use_ue8m0: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-    m, k = input.shape
+    b, m, k = input.shape
     return (
-        input.new_empty([m, k // 2], dtype=torch.int64),  # float4_e2m1_x2
-        input.new_empty([m * k // sf_vec_size], dtype=torch.int32),  # Scale factors
+        input.new_empty([b, m, k // 2], dtype=torch.uint8),  # FLOAT4_E2M1X2
+        input.new_empty(
+            [b, _compute_swizzled_layout_sf_size(m, k // sf_vec_size, 128)],
+            dtype=torch.uint8,
+        ),  # swizzled SF buffer
     )
 
 @register_custom_op(
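
A note on the fake (meta) implementation above: it only has to report output shapes and dtypes for tracing, and the size of the second buffer comes from the swizzled scale-factor layout. A rough sketch of the shape arithmetic, assuming rows are padded to a multiple of 128 and the per-row scale-factor count to a multiple of 4 (the real _compute_swizzled_layout_sf_size may differ; swizzled_sf_size and fake_batched_quantize_shapes are illustrative names):

import torch

def swizzled_sf_size(rows: int, sf_cols: int, row_tile: int = 128, col_tile: int = 4) -> int:
    # Assumed padding rule: round rows up to row_tile and the per-row scale-factor
    # count up to col_tile, with one byte stored per scale factor.
    padded_rows = (rows + row_tile - 1) // row_tile * row_tile
    padded_cols = (sf_cols + col_tile - 1) // col_tile * col_tile
    return padded_rows * padded_cols

def fake_batched_quantize_shapes(b: int, m: int, k: int, sf_vec_size: int = 16):
    # Mirrors the fake op: packed FP4 values (two per uint8 byte) plus a swizzled
    # scale-factor buffer per batch entry, allocated on the meta device for tracing.
    values = torch.empty([b, m, k // 2], dtype=torch.uint8, device="meta")
    scales = torch.empty([b, swizzled_sf_size(m, k // sf_vec_size)], dtype=torch.uint8, device="meta")
    return values, scales

values, scales = fake_batched_quantize_shapes(b=8, m=100, k=256)
print(values.shape, scales.shape)  # torch.Size([8, 100, 128]) torch.Size([8, 2048])

Reporting uint8 here, rather than the old int64/int32 placeholders, keeps buffers allocated from the traced fake shapes consistent with what the CUDA kernel actually writes.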
