vllm-project · yewentao256 · Jun 1, 2026 · May 26, 2026 · May 27, 2026 · May 28, 2026
diff --git a/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py b/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py
@@ -312,22 +312,22 @@ def apply_block_scaled_mm(
     ) -> torch.Tensor:
         out_dtype = self.config.out_dtype
         if self.is_hopper:
-            return torch.ops.vllm.padded_cutlass(
+            return torch.ops.vllm.dynamic_padded_cutlass(
                 A,
                 B,
                 As,
                 Bs,
                 list(self.weight_group_shape),
                 out_dtype,
             )
-        else:
-            return ops.cutlass_scaled_mm(
-                A,
-                B.T,
-                out_dtype=out_dtype,
-                scale_a=As,
-                scale_b=Bs.T,
-            )
+
+        return ops.cutlass_scaled_mm(
+            A,
+            B.T,
+            out_dtype=out_dtype,
+            scale_a=As,
+            scale_b=Bs.T,
+        )
 
 
 def cutlass_scaled_mm(
@@ -397,8 +397,56 @@ def _padded_cutlass_fake(
     )
 
 
+def _dynamic_padded_cutlass(
+    qx: torch.Tensor,
+    weight: torch.Tensor,
+    x_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+    block_size: list[int],
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    """Call ``_padded_cutlass`` or ``cutlass_scaled_mm`` based on ``qx.shape[0]``.
+
+    ``_padded_cutlass`` is used when ``qx.shape[0]`` is not a multiple of 4,
+    ``cutlass_scaled_mm`` otherwise. When ``torch.compiler.is_compiling()`` is
+    true the choice is expressed with ``torch.cond``; otherwise it is a plain
+    Python branch. Both paths receive the same six arguments.
+
+    Returns:
+        The tensor produced by whichever path was selected.
+    """
+    needs_padding = qx.shape[0] % 4 != 0
+    operands = (qx, weight, x_scale, weight_scale)
+
+    # block_size and output_dtype are captured by closure so that only the
+    # four tensors are passed as torch.cond operands.
+    def with_padding(
+        a: torch.Tensor, b: torch.Tensor, a_s: torch.Tensor, b_s: torch.Tensor
+    ) -> torch.Tensor:
+        return _padded_cutlass(a, b, a_s, b_s, block_size, output_dtype)
+
+    def without_padding(
+        a: torch.Tensor, b: torch.Tensor, a_s: torch.Tensor, b_s: torch.Tensor
+    ) -> torch.Tensor:
+        return cutlass_scaled_mm(a, b, a_s, b_s, block_size, output_dtype)
+
+    # NOTE(review): this function is also registered as a custom op; confirm
+    # that is_compiling() can actually be true when it runs through that path.
+    if torch.compiler.is_compiling():
+        return torch.cond(needs_padding, with_padding, without_padding, operands)
+
+    chosen = with_padding if needs_padding else without_padding
+    return chosen(*operands)
+
+
 direct_register_custom_op(
     "padded_cutlass",
     _padded_cutlass,
     fake_impl=_padded_cutlass_fake,
 )
+
+direct_register_custom_op(
+    "dynamic_padded_cutlass",
+    _dynamic_padded_cutlass,
+    fake_impl=_padded_cutlass_fake,  # same fake impl as "padded_cutlass"
+)