sgl-project · hnyls2002 · Apr 10, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
@@ -2515,8 +2515,14 @@ def _evict_swa(self, req: Req, pre_len: int):
         ), "cache_protected_len must be page aligned"
         req.swa_evicted_seqlen = max(req.swa_evicted_seqlen, req.cache_protected_len)
 
+        # Subtract an extra page_size so the eviction frontier never reaches the
+        # radix tree insert boundary (page_floor(seq_len)). This keeps at least one
+        # page of non-evicted SWA KV for the tree to store as a non-tombstone node,
+        # preserving cache reuse in multi-turn scenarios.
+        # See also: _insert_helper case 3 in swa_radix_cache.py (defensive counterpart).
         new_swa_evicted_seqlen = max(
-            req.swa_evicted_seqlen, pre_len - sliding_window_size
+            req.swa_evicted_seqlen,
+            pre_len - sliding_window_size - self.tree_cache.page_size,
         )
 
         if self.tree_cache.page_size > 1:

@@ -1013,6 +1013,26 @@ def _insert_helper(
                 child_key = self.get_child_key_fn(key)
 
         if len(key):
+            # Layout: |--- total_prefix_length ---|--- len(key) ---|
+            #         ^                           ^                ^
+            #         0              total_prefix_length     total_length
+            #
+            # Cases based on swa_evicted_seqlen position:
+            # 1. swa_evicted_seqlen <= total_prefix_length:
+            #    Already handled in the while loop above. All of len(key) is non-tombstone.
+            # 2. total_prefix_length < swa_evicted_seqlen < total_length:
+            #    Split: [total_prefix_length, swa_evicted_seqlen) as tombstone,
+            #           [swa_evicted_seqlen, total_length) as non-tombstone.
+            # 3. swa_evicted_seqlen == total_length:
+            #    All remaining tokens are evicted. Free `value` and return without
+            #    creating a node (leaf nodes must not be tombstones).
+            #    Note: the -page_size fix in _evict_swa prevents this case in normal
+            #    operation; it only guards against unexpected eviction states elsewhere.
+            #    NOTE(review): values > total_length match no case above; confirm.
+            if swa_evicted_seqlen == total_prefix_length + len(key):
+                self.token_to_kv_pool_allocator.free(value)
+                return total_prefix_length
+
             if (
                 swa_evicted_seqlen > total_prefix_length
                 and swa_evicted_seqlen < total_prefix_length + len(key)