From 523214a23840765b560c84ab4a56e1bdd3b78bcf Mon Sep 17 00:00:00 2001
From: Liu-congo <1502632128@qq.com>
Date: Tue, 14 Oct 2025 17:53:33 +0800
Subject: [PATCH 1/2] fix the fp8 quant method, deprecate input_to_float8 in dsv2

Signed-off-by: Liu-congo <1502632128@qq.com>
---
 python/sglang/srt/models/deepseek_v2.py | 33 ++++++++++++-------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 454e08585e9..2ef8b4c576b 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -93,7 +93,6 @@
     block_quant_dequant,
     block_quant_to_tensor_quant,
     channel_quant_to_tensor_quant,
-    input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
     requant_weight_ue8m0_inplace,
 )
@@ -1576,14 +1575,14 @@ def forward_absorb_prepare(
             )
         elif self.w_kc.dtype == torch.float8_e4m3fn:
             # TODO fix the per_tensor_quant_mla_fp8 for cublas 12.9
-            if _is_cublas_ge_129:
-                q_nope_val, q_nope_scale = input_to_float8(
-                    q_nope.transpose(0, 1), torch.float8_e4m3fn
-                )
-            else:
-                q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
-                    q_nope.transpose(0, 1), zero_allocator.allocate(1)
-                )
+            q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
+                q_nope.transpose(0, 1),
+                (
+                    torch.zeros((1,), dtype=torch.float32, device=q_nope.device)
+                    if _is_cublas_ge_129
+                    else zero_allocator.allocate(1)
+                ),
+            )
             q_nope_out = bmm_fp8(
                 q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
             )
@@ -1724,14 +1723,14 @@ def forward_absorb_core(
             attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
 
         elif self.w_vc.dtype == torch.float8_e4m3fn:
-            if _is_cublas_ge_129:
-                attn_output_val, attn_output_scale = input_to_float8(
-                    attn_output.transpose(0, 1), torch.float8_e4m3fn
-                )
-            else:
-                attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
-                    attn_output.transpose(0, 1), zero_allocator.allocate(1)
-                )
+            attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
+                attn_output.transpose(0, 1),
+                (
+                    torch.zeros((1,), dtype=torch.float32, device=attn_output.device)
+                    if _is_cublas_ge_129
+                    else zero_allocator.allocate(1)
+                ),
+            )
             attn_bmm_output = bmm_fp8(
                 attn_output_val,
                 self.w_vc,

From 81fd5bc8c870767ec4e909ff2e1bac9e98f41fe7 Mon Sep 17 00:00:00 2001
From: Liu-congo <1502632128@qq.com>
Date: Wed, 15 Oct 2025 08:09:46 +0800
Subject: [PATCH 2/2] add a comment to explain the modification

Signed-off-by: Liu-congo <1502632128@qq.com>
---
 python/sglang/srt/models/deepseek_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 2ef8b4c576b..8550afe538f 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -1574,7 +1574,7 @@ def forward_absorb_prepare(
                 self.w_kc.to(torch.bfloat16) * self.w_scale,
             )
         elif self.w_kc.dtype == torch.float8_e4m3fn:
-            # TODO fix the per_tensor_quant_mla_fp8 for cublas 12.9
+            # fix bmm_fp8 error under cuBLAS 12.9 caused by the bump allocator (see PR #11612)
             q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
                 q_nope.transpose(0, 1),
                 (
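
Reviewer note (illustrative, not part of the patch): in both hunks per_tensor_quant_mla_fp8 is
called with a pre-allocated one-element fp32 buffer as its second argument and returns the
quantized tensor plus a per-tensor scale. The patch keeps that single code path and only changes
where the buffer comes from, a fresh torch.zeros under cuBLAS >= 12.9 instead of a slot from the
shared bump allocator, which is also what lets the input_to_float8 import be dropped. Below is a
minimal sketch of that calling convention; it assumes the scale is written into the provided
buffer and uses standard amax-based scaling with a hypothetical helper name. It is not the sglang
kernel.

    import torch

    def per_tensor_fp8_quant_sketch(x: torch.Tensor, scale_out: torch.Tensor):
        # Dynamic per-tensor scale: map the absolute maximum onto the fp8 e4m3 range.
        finfo = torch.finfo(torch.float8_e4m3fn)
        amax = x.abs().amax().clamp(min=1e-12).to(torch.float32)
        scale_out.copy_(amax / finfo.max)  # scale is written into the caller's buffer
        x_fp8 = (x.to(torch.float32) / scale_out).clamp(finfo.min, finfo.max).to(
            torch.float8_e4m3fn
        )
        return x_fp8, scale_out

    # Usage mirroring the patch: the quantization call is always the same; only the
    # provenance of the one-element scale buffer depends on the cuBLAS version.
    x = torch.randn(8, 64, 128, dtype=torch.bfloat16)
    scale_buf = torch.zeros((1,), dtype=torch.float32, device=x.device)
    x_fp8, x_scale = per_tensor_fp8_quant_sketch(x, scale_buf)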