From 3618f951254e895721677091410d39f7f6622cdc Mon Sep 17 00:00:00 2001
From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Date: Thu, 5 Jun 2025 16:12:34 +0800
Subject: [PATCH 1/3] fp8_utils: bypass weight shape check in DeepGEMM fp8 linear

---
 python/sglang/srt/layers/quantization/fp8_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
index c180c0a7799..cfba5364910 100644
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -227,8 +227,9 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
     output_dtype = input.dtype
     dtype_supported = output_dtype == torch.bfloat16
 
-    # TODO: add more robust shape check here
-    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+    # TODO: design shape check here
+    # shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+    shape_supported = True
 
     if not (shape_supported and dtype_supported):
         # fall back to triton

From 268b267ab613635b8928aff3e14b1a21725edea6 Mon Sep 17 00:00:00 2001
From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Date: Thu, 5 Jun 2025 17:51:09 +0800
Subject: [PATCH 2/3] fp8_utils: restore shape check, require weight.shape[0] % 64 == 0

---
 python/sglang/srt/layers/quantization/fp8_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
index cfba5364910..67efe322384 100644
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -228,8 +228,7 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
     dtype_supported = output_dtype == torch.bfloat16
 
     # TODO: design shape check here
-    # shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
-    shape_supported = True
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
 
     if not (shape_supported and dtype_supported):
         # fall back to triton

From 74508e992d9ade0e73fb2a67990dc3e7402d1867 Mon Sep 17 00:00:00 2001
From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Date: Thu, 5 Jun 2025 17:51:46 +0800
Subject: [PATCH 3/3] fp8_utils: link shape-check TODO to PR discussion

---
 python/sglang/srt/layers/quantization/fp8_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
index 67efe322384..e105e50c305 100644
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -227,7 +227,7 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback(
     output_dtype = input.dtype
     dtype_supported = output_dtype == torch.bfloat16
 
-    # TODO: design shape check here
+    # TODO: add more robust shape check, see https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
     shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
 
     if not (shape_supported and dtype_supported):