diff --git a/python/sglang/srt/layers/attention/wave_backend.py b/python/sglang/srt/layers/attention/wave_backend.py index 18c193ade493..2e01c554ce92 100644 --- a/python/sglang/srt/layers/attention/wave_backend.py +++ b/python/sglang/srt/layers/attention/wave_backend.py @@ -7,7 +7,7 @@ import triton import triton.language as tl -from sglang.srt.layers.attention import AttentionBackend +from sglang.srt.layers.attention.base_attn_backend import AttentionBackend from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode @@ -88,7 +88,7 @@ class WaveAttnBackend(AttentionBackend): def __init__( self, model_runner: ModelRunner, - skip_prefill: bool, + skip_prefill: bool = False, kv_indptr_buf: Optional[torch.Tensor] = None, ): # Lazy import to avoid the initialization of cuda context diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 7e668fb956ef..31b6d09dec23 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -860,7 +860,7 @@ def process_weights_hip_int4(self, layer: Module): layer.w13_weight_scale1[expert_id] *= max_w13_scales[expert_id] layer.w2_weight_scale1[expert_id] *= layer.w2_weight_scale[expert_id] - def process_weights_hip_scale_padding(self, layer: Module, padding_size: int): + def process_weights_hip_scale_padding(self, layer: Module, padding_size: int=0): from sglang.srt.layers.moe.fused_moe_triton.fused_moe import ( padding_size, # Avoid circular import )