Skip to content
8 changes: 7 additions & 1 deletion python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2515,8 +2515,14 @@ def _evict_swa(self, req: Req, pre_len: int):
), "cache_protected_len must be page aligned"
req.swa_evicted_seqlen = max(req.swa_evicted_seqlen, req.cache_protected_len)

# Subtract an extra page_size so the eviction frontier never reaches the
# radix tree insert boundary (page_floor(seq_len)). This keeps at least one
# page of non-evicted SWA KV for the tree to store as a non-tombstone node,
# preserving cache reuse in multi-turn scenarios.
# See also: _insert_helper case 3 in swa_radix_cache.py (defensive counterpart).
new_swa_evicted_seqlen = max(
req.swa_evicted_seqlen, pre_len - sliding_window_size
req.swa_evicted_seqlen,
pre_len - sliding_window_size - self.tree_cache.page_size,
)

if self.tree_cache.page_size > 1:
Expand Down
20 changes: 20 additions & 0 deletions python/sglang/srt/mem_cache/swa_radix_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,26 @@ def _insert_helper(
child_key = self.get_child_key_fn(key)

if len(key):
# Layout: |--- total_prefix_length ---|--- len(key) ---|
#         ^                           ^                ^
#         0                  total_prefix_length  total_length
#
# Cases based on swa_evicted_seqlen position:
# 1. swa_evicted_seqlen <= total_prefix_length:
# Already handled in the while loop above. All of len(key) is non-tombstone.
# 2. total_prefix_length < swa_evicted_seqlen < total_length:
# Split: [total_prefix_length, swa_evicted_seqlen) as tombstone,
# [swa_evicted_seqlen, total_length) as non-tombstone.
# 3. swa_evicted_seqlen == total_length:
# All remaining tokens are evicted. Free value and return without
# creating a node (leaf nodes must not be tombstone).
# Note: the -page_size fix in _evict_swa prevents this case from
# occurring in normal operation. This check is a defensive guard
# against unexpected eviction states from other code paths.
if swa_evicted_seqlen == total_prefix_length + len(key):
self.token_to_kv_pool_allocator.free(value)
return total_prefix_length

if (
swa_evicted_seqlen > total_prefix_length
and swa_evicted_seqlen < total_prefix_length + len(key)
Expand Down
Loading
Loading