sgl-project · HaiShaw · May 18, 2026 · May 18, 2026
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
@@ -450,17 +450,16 @@ def _derive_hybrid_model(self):
         if not self.is_hybrid_swa:
             self.has_attention_sinks = self._detect_attention_sinks()
             self.is_hybrid_swa_compress = False
-            self.is_swa_with_compressed_attention = False
             return
 
         logger.info(f"Hybrid swa model: {self.hf_config.architectures=}")
 
-        self.is_swa_with_compressed_attention = any(
+        self.is_deepseek_v4_arch = any(
             arch in ["DeepseekV4ForCausalLM", "DeepseekV4ForCausalLMNextN"]
             for arch in self.hf_config.architectures
         )
 
-        if self.is_hybrid_swa and not self.is_swa_with_compressed_attention:
+        if not self.is_deepseek_v4_arch:
             self.swa_attention_layer_ids, self.full_attention_layer_ids = (
                 get_hybrid_layer_ids(
                     self.hf_config.architectures,

@@ -866,10 +866,7 @@ def init_cache_with_memory_pool(self):
         )
 
         if effective_chunked_prefill_size is not None and self.disable_radix_cache:
-            is_v4_compressed = getattr(
-                self.model_config, "is_swa_with_compressed_attention", False
-            )
-            if not self.is_hybrid_swa or is_v4_compressed:
+            if not self.is_hybrid_swa:
                 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 
                 self.tree_cache = ChunkCache(params)