diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index bff8ca74e17e..4e436b7856c5 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -450,17 +450,16 @@ def _derive_hybrid_model(self): if not self.is_hybrid_swa: self.has_attention_sinks = self._detect_attention_sinks() self.is_hybrid_swa_compress = False - self.is_swa_with_compressed_attention = False return logger.info(f"Hybrid swa model: {self.hf_config.architectures=}") - self.is_swa_with_compressed_attention = any( + self.is_deepseek_v4_arch = any( arch in ["DeepseekV4ForCausalLM", "DeepseekV4ForCausalLMNextN"] for arch in self.hf_config.architectures ) - if self.is_hybrid_swa and not self.is_swa_with_compressed_attention: + if not self.is_deepseek_v4_arch: self.swa_attention_layer_ids, self.full_attention_layer_ids = ( get_hybrid_layer_ids( self.hf_config.architectures, diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index f080da104233..9c694321cbef 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -866,10 +866,7 @@ def init_cache_with_memory_pool(self): ) if effective_chunked_prefill_size is not None and self.disable_radix_cache: - is_v4_compressed = getattr( - self.model_config, "is_swa_with_compressed_attention", False - ) - if not self.is_hybrid_swa or is_v4_compressed: + if not self.is_hybrid_swa: from sglang.srt.mem_cache.chunk_cache import ChunkCache self.tree_cache = ChunkCache(params)