From 4871b45efcc8d9afefa161967c01e6ffb657d84c Mon Sep 17 00:00:00 2001 From: Feiyue Zhai Date: Thu, 12 Mar 2026 11:00:32 +0800 Subject: [PATCH 1/3] [PD] Add SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS env var for configurable KV transfer overlap When max_num_reqs > 32, the decode worker's req_to_token_pool pre_alloc_size was hardcoded to 0, limiting KV cache transfer concurrency. This adds an environment variable to configure extra pool slots so more KV transfers can overlap with decode execution without raising max_running_requests. Made-with: Cursor --- python/sglang/srt/environ.py | 4 ++++ .../sglang/srt/model_executor/model_runner_kv_cache_mixin.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index e9c6481ba9f8..e8fdc7d17155 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -244,6 +244,10 @@ class Envs: SGLANG_DISAGGREGATION_WAITING_TIMEOUT = EnvInt(300) SGLANG_DISAGGREGATION_NIXL_BACKEND = EnvStr("UCX") SGLANG_DISAGGREGATION_ALL_CP_RANKS_TRANSFER = EnvBool(False) + # Extra slots in req_to_token_pool for decode workers (only effective when + # max_num_reqs > 32). Increases pool capacity so more KV cache transfers + # can overlap with decode execution without raising max_running_requests. + SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS = EnvInt(0) # Scheduler: others: SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1) # in seconds. Set if you observe high memory accumulation over a long serving period. diff --git a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py index 588da80342ca..0f25ab8ca81f 100644 --- a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py +++ b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py @@ -388,7 +388,9 @@ def _init_pools(self: ModelRunner): # subscribe memory for pre-allocated requests # if max_num_reqs <= 32, we pre-allocate 2x requests - pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0 + from sglang.srt.environ import envs + pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get() + pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size if config := self.mambaish_config: self.req_to_token_pool = HybridMambaDecodeReqToTokenPool( size=max_num_reqs, From 163ea06deec7843260c7e2597af5b5d7649f87cc Mon Sep 17 00:00:00 2001 From: Feiyue Zhai Date: Fri, 13 Mar 2026 10:37:59 +0800 Subject: [PATCH 2/3] fix format --- .../sglang/srt/model_executor/model_runner_kv_cache_mixin.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py index 0f25ab8ca81f..cdb142e765c9 100644 --- a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py +++ b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py @@ -389,8 +389,11 @@ def _init_pools(self: ModelRunner): # subscribe memory for pre-allocated requests # if max_num_reqs <= 32, we pre-allocate 2x requests from sglang.srt.environ import envs + pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get() - pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size + pre_alloc_size = ( + max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size + ) if config := self.mambaish_config: self.req_to_token_pool = HybridMambaDecodeReqToTokenPool( size=max_num_reqs, From a232e1c29b47b70afdbf6bd8ee0f0125dc34e7fe Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Fri, 27 Mar 2026 01:20:56 -0700 Subject: [PATCH 3/3] Address review comments --- python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py index cdb142e765c9..5b3f8515a484 100644 --- a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py +++ b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py @@ -8,6 +8,7 @@ from sglang.srt.configs.model_config import get_nsa_index_head_dim, is_deepseek_nsa from sglang.srt.distributed.parallel_state import get_world_group +from sglang.srt.environ import envs from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.mem_cache.allocator import ( PagedTokenToKVPoolAllocator, @@ -388,7 +389,6 @@ def _init_pools(self: ModelRunner): # subscribe memory for pre-allocated requests # if max_num_reqs <= 32, we pre-allocate 2x requests - from sglang.srt.environ import envs pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get() pre_alloc_size = (