@@ -336,7 +336,7 @@ def silu_and_mul_masked_post_quant_fwd(
     scale_k = ceil_div(k, quant_group_size)
     m_padded = align(m, alignment)
     scale_k_padded = align(scale_k, alignment)
-    output_scale = torch.zeros((g, scale_k_padded // 4, m_padded),
+    output_scale = torch.empty((g, scale_k_padded // 4, m_padded),
                                dtype=torch.int32,
                                device='cuda')

@@ -458,6 +458,7 @@ def per_token_quant_and_transform(
     input: torch.Tensor,
     quant_group_size: int = 128,
     scale_ue8m0: bool = True,
+    swap_ab: bool = False,
 ):
     """
     input shape [g, m, k]
@@ -477,18 +478,21 @@ def per_token_quant_and_transform(
     fp8_min = -fp8_max

     m, k = input.shape
+    m_padded = m if not swap_ab else align(m, 8)

     # Create output
-    output = torch.empty((m, k), dtype=torch.float8_e4m3fn, device="cuda")
+    output = torch.empty((m_padded, k),
+                         dtype=torch.float8_e4m3fn,
+                         device=input.device)

     # Create output scale
     alignment = 4
     scale_k = ceil_div(k, quant_group_size)
-    m_padded = align(m, alignment)
+    m_aligned = align(m_padded, alignment)
     scale_k_padded = align(scale_k, alignment)
-    output_scale = torch.zeros((scale_k_padded // 4, m_padded),
+    output_scale = torch.empty((scale_k_padded // 4, m_aligned),
                                dtype=torch.int32,
-                               device='cuda')
+                               device=input.device)

     # Get block/grid/stage/warp
     BLOCK_NUM_PER_EXPERT = 64
@@ -518,13 +522,56 @@ def per_token_quant_and_transform(
         num_warps=num_warps,
         SCALE_UE8M0=scale_ue8m0,
     )
-    output_scale = output_scale.transpose(0, 1)[:m, :]
+    output_scale = output_scale.transpose(0, 1)[:m_padded, :]
     check_sf_layout(
         output_scale,
-        m,
+        m_padded,
         k,
         (1, 128),
         num_groups=None,
         tma_stride_check=True,
     )
     return output, output_scale
+
+
+@triton.jit
+def _transpose_kernel(input_ptr, output_ptr, M, N, stride_in_m, stride_in_n,
+                      stride_out_m, stride_out_n, BLOCK_SIZE: tl.constexpr):
+    row_block = tl.program_id(0)
+    col_block = tl.program_id(1)
+
+    row = row_block * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    col = col_block * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+
+    mask_row = row < M
+    mask_col = col < N
+    mask = mask_row[:, None] & mask_col[None, :]
+
+    input_idx = row[:, None] * stride_in_m + col[None, :] * stride_in_n
+    data = tl.load(input_ptr + input_idx, mask=mask, other=0)
+
+    output_idx = row[:, None] * stride_out_n + col[None, :] * stride_out_m
+    tl.store(output_ptr + output_idx, data, mask=mask)
+
+
+def masked_transpose(input: torch.Tensor, n_available: int) -> torch.Tensor:
+    M, N = input.shape
+    BLOCK_SIZE = 32
+    output = torch.empty((n_available, M),
+                         dtype=input.dtype,
+                         device=input.device)
+
+    grid = ((M + BLOCK_SIZE - 1) // BLOCK_SIZE,
+            (n_available + BLOCK_SIZE - 1) // BLOCK_SIZE)
+    _transpose_kernel[grid](
+        input,
+        output,
+        M,
+        n_available,
+        input.stride(0),
+        input.stride(1),
+        output.stride(0),
+        output.stride(1),
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return output
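
For reference, a minimal sanity check of the new masked_transpose helper could look like the sketch below. This is illustrative only: it assumes a CUDA device and an arbitrary float16 input, and the names x / n_avail are made up. It just confirms that the helper returns the transpose of the first n_available columns of a 2-D tensor.

# Illustrative sanity check (not part of the change); assumes a CUDA device
# and that masked_transpose above is importable from this module.
import torch

x = torch.randn(10, 64, device="cuda", dtype=torch.float16)  # shape (M, N)
n_avail = 40  # only the first 40 columns are considered valid

y = masked_transpose(x, n_avail)  # expected shape: (n_avail, M)
assert y.shape == (n_avail, 10)

# The kernel writes output[col, row] = input[row, col] for col < n_avail,
# so the result should match a plain slice-then-transpose.
torch.testing.assert_close(y, x[:, :n_avail].t())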