diff --git a/python/sglang/srt/layers/quantization/marlin_utils_fp8.py b/python/sglang/srt/layers/quantization/marlin_utils_fp8.py index 94326d71e54d..1e8e85be0131 100644 --- a/python/sglang/srt/layers/quantization/marlin_utils_fp8.py +++ b/python/sglang/srt/layers/quantization/marlin_utils_fp8.py @@ -62,7 +62,6 @@ def apply_fp8_marlin_linear( a=reshaped_x, c=None, b_q_weight=weight, - b_bias=bias, b_scales=weight_scale, global_scale=None, b_zeros=None, @@ -77,6 +76,9 @@ def apply_fp8_marlin_linear( use_fp32_reduce=use_fp32_reduce, ) + if bias is not None: + output.add_(bias) + return output.reshape(out_shape)