flashinfer-ai · yzh119 · Oct 1, 2025 · Oct 1, 2025 · gemini-code-assist · Oct 1, 2025
@@ -795,7 +795,6 @@ __device__ inline void quantize_with_block_size_impl(int32_t numbatches, int32_t
   static constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / ELTS_PER_THREAD;
   static_assert(sizeof(PackedVec) == sizeof(Type) * ELTS_PER_THREAD, "Vec size is not matched.");
 
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
   bool isSfSwizzledLayout = layout == QuantizationSFLayout::SWIZZLED_128x4 ||
                             layout == QuantizationSFLayout::SWIZZLED_8x4;
   int rowTile = (layout == QuantizationSFLayout::SWIZZLED_128x4) ? 128 : 8;
@@ -810,6 +809,7 @@ __device__ inline void quantize_with_block_size_impl(int32_t numbatches, int32_t
   asm volatile("griddepcontrol.wait;");
   for (int rowIdx = blockIdx.x; rowIdx < numPaddedRowsForSf; rowIdx += gridDim.x) {
     for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) {
+      float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[batchIdx];
       for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x) {
         std::optional<int> optionalBatchIdx = batchIdx;
         std::optional<int> optionalNumRows = numRows;

diff --git a/tests/utils/test_fp4_quantize.py b/tests/utils/test_fp4_quantize.py
@@ -18,7 +18,7 @@
 DTYPES = [torch.float16, torch.bfloat16]
 # The batch dimension doesn't need to be multiple of 128
 SHAPES = [(128, 64), (256, 128), (120, 64), (200, 256)]
-BATCH_SHAPES = [(2, 128, 64), (3, 256, 128), (1, 120, 64)]
+BATCH_SHAPES = [(1, 256, 128), (2, 128, 64), (3, 256, 128), (1, 120, 64)]
 SEEDS = [42]
 CUDA_DEVICES = ["cuda:0"]
 
@@ -334,7 +334,7 @@ def test_nvfp4_batched_quantize(
 
     b, m, n = batch_shape
     x = torch.randn(batch_shape, dtype=dtype)
-    tensor_amax = torch.abs(x).max().to(torch.float32)
+    tensor_amax = torch.abs(x).amax(dim=(1, 2)).to(torch.float32)
     global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
     mask = None
     # Test the batched quantization
@@ -357,7 +357,7 @@ def test_nvfp4_batched_quantize(
 
     # Compare with single tensor quantization for each batch
     for i in range(b):
-        single_out, single_scale = fp4_quantize(x[i], global_scale, 16, False, True)
+        single_out, single_scale = fp4_quantize(x[i], global_scale[i], 16, False, True)
         if use_mask:
             torch.testing.assert_close(
                 out[i][: mask[i]], single_out[: mask[i]], rtol=1e-5, atol=1e-5
@@ -414,7 +414,7 @@ def test_silu_and_mul_nvfp4_batched_quantize(
     for i in range(b):
         x_silu_mul = silu_and_mul(x[i])
         single_out, single_scale = fp4_quantize(
-            x_silu_mul, global_scale, 16, False, True
+            x_silu_mul, global_scale[i], 16, False, True
         )
         torch.testing.assert_close(
             out[i][: mask[i]], single_out[: mask[i]], rtol=1e-5, atol=1e-5