From 5508f7473284193fd623114da963672a5edf57ea Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Thu, 27 Mar 2025 20:06:31 -0700
Subject: [PATCH] Revert "gemma3: impl `get_attention_sliding_window_size` for attn init (#4823)"

This reverts commit 0bc0bf57341d4b9cdd0a096ff321b5128719a983.
---
 python/sglang/srt/models/gemma3_causal.py | 14 ++------------
 python/sglang/srt/models/gemma3_mm.py     |  6 ------
 2 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/python/sglang/srt/models/gemma3_causal.py b/python/sglang/srt/models/gemma3_causal.py
index d9e0293b76a..d892c515254 100644
--- a/python/sglang/srt/models/gemma3_causal.py
+++ b/python/sglang/srt/models/gemma3_causal.py
@@ -47,12 +47,6 @@
 from sglang.srt.utils import add_prefix, make_layers
 
 
-# Aligned with HF's implementation, using sliding window inclusive with the last token
-# SGLang assumes exclusive
-def get_attention_sliding_window_size(config):
-    return config.sliding_window - 1
-
-
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3.py
 def extract_layer_index(prefix: str) -> int:
@@ -176,7 +170,7 @@ def __init__(
             self.rope_scaling = {"rope_type": "default"}
             # FIXME(mick): idk why vllm does this
             # self.sliding_window = config.interleaved_sliding_window
-            self.sliding_window = get_attention_sliding_window_size(config)
+            self.sliding_window = config.sliding_window
         else:
             # Global attention. Use the values in config.json.
             self.rope_theta = config.rope_theta
@@ -190,8 +184,6 @@ def __init__(
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
             logit_cap=getattr(self.config, "attn_logit_softcapping", None),
-            # Module must also define `get_attention_sliding_window_size` to correctly initialize
-            # attention backend in `ForwardBatch`.
             sliding_window_size=self.sliding_window,
             prefix=add_prefix("attn", prefix),
         )
@@ -617,9 +609,6 @@ def __init__(
     def get_input_embeddings(self) -> nn.Embedding:
         return self.model.embed_tokens
 
-    def get_attention_sliding_window_size(self):
-        return get_attention_sliding_window_size(self.config)
-
     def dtype(self) -> torch.dtype:
         return next(self.parameters()).dtype
 
@@ -632,6 +621,7 @@ def forward(
         input_embeds: torch.Tensor = None,
         **kwargs,
     ) -> LogitsProcessor:
+
         hidden_states = self.model(
             input_ids, positions, forward_batch, input_embeds, **kwargs
         )
diff --git a/python/sglang/srt/models/gemma3_mm.py b/python/sglang/srt/models/gemma3_mm.py
index 80dd7197a37..c357bf9e595 100644
--- a/python/sglang/srt/models/gemma3_mm.py
+++ b/python/sglang/srt/models/gemma3_mm.py
@@ -268,12 +268,6 @@ def prepare_attn_masks(
     def get_input_embeddings(self) -> nn.Embedding:
         return self.language_model.get_input_embeddings()
 
-    def get_attention_sliding_window_size(self):
-        """
-        This value is used to initialize attention backends in `ForwardBatch`.
-        """
-        return self.language_model.get_attention_sliding_window_size()
-
     def get_image_feature(self, image_input: MultimodalInputs):
         """
         Projects the last hidden state from the vision model into language model space.
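
For reference, the helper removed by this revert bridged the window-size convention mismatch described in its own comment: HF's implementation treats `sliding_window` as inclusive of the last token, while SGLang assumes an exclusive window, hence `config.sliding_window - 1`. The sketch below illustrates only that conversion; `DummyGemma3Config` is an illustrative stand-in, not an actual SGLang or HF class.

    # Minimal sketch of the inclusive-vs-exclusive sliding-window conversion
    # performed by the reverted helper. `DummyGemma3Config` is a stand-in for
    # the real HF config object (an assumption, not SGLang/HF API).
    from dataclasses import dataclass


    @dataclass
    class DummyGemma3Config:
        sliding_window: int  # HF convention: window includes the last token


    def get_attention_sliding_window_size(config: DummyGemma3Config) -> int:
        # SGLang's attention backends count only tokens strictly before the
        # current one, so the HF value is shifted down by one.
        return config.sliding_window - 1


    if __name__ == "__main__":
        cfg = DummyGemma3Config(sliding_window=1024)
        assert get_attention_sliding_window_size(cfg) == 1023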