diff --git a/python/sglang/srt/models/deepseek_v4.py b/python/sglang/srt/models/deepseek_v4.py index b8df0c3135e2..16d42526b6b3 100644 --- a/python/sglang/srt/models/deepseek_v4.py +++ b/python/sglang/srt/models/deepseek_v4.py @@ -594,6 +594,7 @@ def forward( o.reshape(T * G, D).contiguous(), group_size=128, ) + o_s = deep_gemm.ceil_to_ue8m0(o_s) output = torch.empty(T, G, R, device=o.device, dtype=torch.bfloat16) deep_gemm.fp8_einsum( "bhr,hdr->bhd",