From 3618f951254e895721677091410d39f7f6622cdc Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Thu, 5 Jun 2025 16:12:34 +0800 Subject: [PATCH 1/3] Update fp8_utils.py --- python/sglang/srt/layers/quantization/fp8_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index c180c0a7799..cfba5364910 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -227,8 +227,9 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback( output_dtype = input.dtype dtype_supported = output_dtype == torch.bfloat16 - # TODO: add more robust shape check here - shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0 + # TODO: design shape check here + # shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0 + shape_supported = True if not (shape_supported and dtype_supported): # fall back to triton From 268b267ab613635b8928aff3e14b1a21725edea6 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Thu, 5 Jun 2025 17:51:09 +0800 Subject: [PATCH 2/3] Update fp8_utils.py --- python/sglang/srt/layers/quantization/fp8_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index cfba5364910..67efe322384 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -228,8 +228,7 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback( dtype_supported = output_dtype == torch.bfloat16 # TODO: design shape check here - # shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0 - shape_supported = True + shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0 if not (shape_supported and dtype_supported): # fall back to triton From 74508e992d9ade0e73fb2a67990dc3e7402d1867 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Thu, 5 Jun 2025 17:51:46 +0800 Subject: [PATCH 3/3] Update fp8_utils.py --- python/sglang/srt/layers/quantization/fp8_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 67efe322384..e105e50c305 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -227,7 +227,7 @@ def deepgemm_w8a8_block_fp8_linear_with_fallback( output_dtype = input.dtype dtype_supported = output_dtype == torch.bfloat16 - # TODO: design shape check here + # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737 shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0 if not (shape_supported and dtype_supported):